<a href="https://colab.research.google.com/github/tkeldenich/Sentences_Embedding_Visualization_TSNE/blob/main/tsne_sentences_embedding_visualization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **[EN] TSNE - Sentences Embedding Visualization**

 - [Read the article in English](https://inside-machinelearning.com/en/efficient-sentences-embedding-visualization-tsne/)

# **[FR] TSNE - Sentences Embedding Visualization**
 - [Lire l'article en français](https://inside-machinelearning.com/visualisation-embedding-de-phrases-tsne/)

# **Preparing Data - GoEmotions**

In [None]:
# Download the three GoEmotions CSV shards into data/full_dataset/.
# --quiet silences wget without relying on the bash-specific `&>` redirect
# (and matches the other wget call in this notebook); --no-clobber skips files
# that are already present, so re-running the cell does not leave duplicate
# "goemotions_N.csv.1" copies behind.
!wget --quiet --no-clobber -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv
!wget --quiet --no-clobber -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv
!wget --quiet --no-clobber -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_3.csv

In [None]:
import numpy as np
import pandas as pd

# Read the three shards from the same relative directory the download cell
# writes to (data/full_dataset/) rather than a hard-coded absolute /content/
# path, so the notebook also runs when the working directory is not /content.
df1 = pd.read_csv('data/full_dataset/goemotions_1.csv')
df2 = pd.read_csv('data/full_dataset/goemotions_2.csv')
df3 = pd.read_csv('data/full_dataset/goemotions_3.csv')

# Stack the shards, drop the metadata columns that are not used below, keep the
# first row for each (text, author, subreddit) combination and renumber the
# rows from 0. Built as one chain (no inplace=True) so re-running the cell
# always starts from the freshly read shards.
df = (
    pd.concat([df1, df2, df3], sort=False)
    .drop(columns=['id', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear'])
    .drop_duplicates(subset=['text', 'author', 'subreddit'])
    .reset_index(drop=True)
)

In [None]:
# Number of rows left after de-duplication.
len(df)

58009

In [None]:
# Show the first row to inspect the columns that remain: the text, its author
# and subreddit, followed by one 0/1 column per emotion.
df.head(1)

Unnamed: 0,text,author,subreddit,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,Brdd9,nrl,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [None]:
# Work on an explicit copy of the text column: adding a column to a bare
# df[['text']] selection is what triggers pandas' SettingWithCopyWarning.
df_analysis = df[['text']].copy()

In [None]:
# Label every text with the name of the emotion column holding its row maximum.
# `assign` returns a new frame, so nothing is written into a slice of `df`
# (the item assignment used before raised SettingWithCopyWarning).
# NOTE(review): idxmax returns the first matching column on ties, so a text
# flagged with several emotions (or with none at all) gets whichever of the
# tied columns comes first in column order - confirm this is acceptable.
emotion_columns = df.drop(columns=['text', 'author', 'subreddit'])
df_analysis = df_analysis.assign(text_emotion=emotion_columns.idxmax(axis=1))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
# Check the derived label on the first row.
df_analysis.head(1)

Unnamed: 0,text,text_emotion
0,That game hurt.,sadness


In [None]:
# Emotions kept for the visualization, in the order their rows are stacked.
# Not selected (previously toggled via commented-out lines): admiration,
# gratitude, disapproval, disappointment, realization, caring, disgust,
# remorse, approval.
selected_emotions = ['joy', 'sadness', 'curiosity', 'neutral', 'love', 'amusement', 'embarrassment']

# For each selected emotion take at most its first 1000 texts, then concatenate
# the per-emotion slices (original row index is kept).
df_analysis = pd.concat([
    df_analysis[df_analysis['text_emotion'] == emotion].iloc[:1000]
    for emotion in selected_emotions
])

# **Transfer Learning - BERT**

In [None]:
import tensorflow_hub as hub

# TF-Hub handle of the BERT encoder to load (uncased English, L-12 / H-768 / A-12, version 1).
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
# Alternative, larger handle (L-24 / H-1024 / A-16); uncomment to use it instead.
#module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"

# Wrap the hub module as a Keras layer; trainable=True marks its weights as
# trainable so they are updated when the model built on top of it is fitted.
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [None]:
# Fetch the `tokenization` module that is imported further down.
# NOTE(review): the URL points at the `master` branch, so the file (or its
# path) can change or disappear upstream - consider pinning to a commit or tag.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
# Install sentencepiece (presumably needed by the downloaded tokenization
# module - confirm). %pip installs into the environment of the running kernel,
# and --quiet trims the log without discarding error messages the way
# `&> /dev/null` did.
# NOTE(review): the version is not pinned - pin it once a known-good one is chosen.
%pip install --quiet sentencepiece

In [None]:
import tokenization
import numpy as np

def bert_encode(texts, tokenizer, max_len=512):
    """Convert raw texts into fixed-length BERT input arrays.

    Each text is tokenized, truncated so the two special tokens still fit,
    wrapped as ``[CLS] ... [SEP]`` and right-padded with id 0 up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded row, special tokens included.
            Must be at least 2.

    Returns:
        A tuple ``(token_ids, masks, segment_ids)`` of integer arrays, each of
        shape ``(number of texts, max_len)``. ``masks`` is 1 on real tokens and
        0 on padding; ``segment_ids`` is all zeros.

    Raises:
        ValueError: If ``max_len`` is smaller than 2, i.e. too short to hold
            ``[CLS]`` and ``[SEP]``.
    """
    if max_len < 2:
        # With max_len < 2 the truncation slice goes negative and the rows
        # would silently come out longer than max_len.
        raise ValueError("max_len must be at least 2 to fit [CLS] and [SEP], got %r" % (max_len,))

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Reserve two positions for the [CLS] and [SEP] markers.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        token_ids = list(tokenizer.convert_tokens_to_ids(input_sequence)) + [0] * pad_len

        all_tokens.append(token_ids)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    # dtype=int is what numpy infers for lists of Python ints; the reshape only
    # matters for empty input, where it yields (0, max_len) instead of (0,).
    return (
        np.array(all_tokens, dtype=int).reshape(-1, max_len),
        np.array(all_masks, dtype=int).reshape(-1, max_len),
        np.array(all_segments, dtype=int).reshape(-1, max_len),
    )

In [None]:
# Read the vocabulary file path and the lower-casing flag that ship with the
# loaded hub module, so the tokenizer is configured from the same module.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# Build the tokenizer from the downloaded `tokenization` module.
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [None]:
# Encode every selected text as a (token ids, masks, segment ids) tuple with
# each row padded or truncated to 100 positions. The same length is passed to
# build_model, and the two values have to agree.
train_input = bert_encode(df_analysis.text.values, tokenizer, max_len=100)

# Emotion name of each text, in the same row order as the encoded inputs.
train_labels = df_analysis.text_emotion.values

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint

def build_model(bert_layer, max_len=512, num_classes=None):
    """Build and compile an emotion classifier on top of a BERT hub layer.

    Args:
        bert_layer: Keras layer called with ``[word ids, mask, segment ids]``
            and returning two outputs, the second of which is indexed as
            ``[:, 0, :]`` here.
        max_len: Length of each of the three integer input sequences.
        num_classes: Number of output units. When omitted, it is taken from
            the number of distinct values in the notebook-level
            ``train_labels`` (the previous, implicit behavior).

    Returns:
        A compiled Keras ``Model`` with three int32 inputs of shape
        ``(max_len,)`` and one dense output of ``num_classes`` units. The layer
        named ``'flatten'`` exposes the vector fed to that dense layer.
    """
    if num_classes is None:
        # Backward-compatible default: infer the class count from the global labels.
        num_classes = len(np.unique(train_labels))

    input_word_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = layers.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # The first output of the hub layer is not used; from the second one only
    # position 0 of the sequence axis (where bert_encode puts [CLS]) is kept.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]

    # Named layer so the sentence vector can be fetched later with
    # model.get_layer('flatten').
    flatten = layers.Flatten(name='flatten')
    output_flatten = flatten(clf_output)

    # NOTE(review): sigmoid + binary_crossentropy treats each class independently,
    # although each text carries a single label here; softmax +
    # categorical_crossentropy may be the better fit - left unchanged, confirm.
    out = layers.Dense(num_classes, activation='sigmoid')(output_flatten)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` replaces the deprecated `lr` keyword, which emitted a
    # warning from Adam's constructor.
    model.compile(Adam(learning_rate=2e-6), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
# Build the classifier for 100-position inputs, matching the max_len used when
# encoding the texts.
model = build_model(bert_layer, max_len=100)

  super(Adam, self).__init__(name, **kwargs)


In [None]:
# One-hot encode the emotion names (one column per emotion, sorted by name).
# The dtype is pinned so the targets stay numeric 0/1 on every pandas version;
# recent pandas releases return booleans when no dtype is given.
label_dummy = pd.get_dummies(train_labels, dtype='uint8')

In [None]:
# Inspect the first two one-hot rows and the column order of the classes.
label_dummy.head(2)

Unnamed: 0,amusement,curiosity,embarrassment,joy,love,neutral,sadness
0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,0


In [None]:
# Keras takes the validation split from the *last* rows of the arrays, before
# any shuffling. The rows are stacked emotion by emotion, so an unshuffled 20%
# tail would contain only the last emotion(s) and training would hardly see
# them. Shuffle inputs and targets once, with a fixed seed for repeatability.
# `train_input` itself is left untouched, so later cells that pair it with
# `df_analysis` stay row-aligned.
shuffle_idx = np.random.default_rng(42).permutation(len(label_dummy))

train_history = model.fit(
    tuple(arr[shuffle_idx] for arr in train_input),
    label_dummy.to_numpy(dtype='float32')[shuffle_idx],
    validation_split=0.2,
    epochs=10,
    batch_size=32
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# **Text Embedding Visualization - TSNE**

In [None]:
# Sub-model sharing the trained model's inputs but stopping at the layer named
# 'flatten', i.e. at the vector that feeds the final dense layer in build_model.
intermediate_layer_model = Model(inputs=model.input,
                                 outputs=model.get_layer('flatten').output)
# Run every encoded text through it to obtain one embedding vector per text.
sentence_embedded = intermediate_layer_model.predict(train_input)

In [None]:
# Emotion label of each text, as a pandas Series in the same row order as the embeddings.
labels_emotion = df_analysis.text_emotion

In [None]:
# Sanity check: (number of texts, embedding size).
sentence_embedded.shape

(6433, 768)

In [None]:
# Sanity check: the number of labels should equal the number of embedded texts.
labels_emotion.shape

(6433,)

In [None]:
import numpy as np
from sklearn.manifold import TSNE

# Pass the embedding matrix to t-SNE directly; converting it to a Python list
# of rows first is unnecessary.
X = np.asarray(sentence_embedded)

# t-SNE is stochastic: fix the seed so the 2-D layout is the same on every run.
X_embedded = TSNE(n_components=2, random_state=42).fit_transform(X)

In [None]:
# Put the 2-D t-SNE coordinates in a frame with columns x / y and attach the
# emotion of each text as `label` (positional match via .values).
df_embeddings = (
    pd.DataFrame(X_embedded)
    .rename(columns={0: 'x', 1: 'y'})
    .assign(label=df_analysis.text_emotion.values)
)

In [None]:
# Attach the original text of each point (positional match via .values) so it
# can be shown on hover in the plots below.
df_embeddings = df_embeddings.assign(text=df_analysis.text.values)

In [None]:
import plotly.express as px

# Scatter plot of the t-SNE coordinates, one colour per emotion, with the text
# shown on hover. (A comma was missing after the `labels` argument, which made
# this cell a SyntaxError.)
fig = px.scatter(
    df_embeddings, x='x', y='y',
    color='label', labels={'color': 'label'},
    hover_data=['text'], title = 'GoEmotions Embedding Visualization')
fig.show()

# **Going Further**

In [None]:
# Number of characters of each text, computed with the vectorised string
# accessor instead of the deprecated DataFrame.applymap with a per-cell lambda.
df_embeddings['length_text'] = df_embeddings['text'].str.len()

In [None]:
# Check the new length_text column on the first row.
df_embeddings.head(1)

Unnamed: 0,x,y,label,text,length_text
0,40.34008,-17.021473,joy,By far the coolest thing I've seen on this thr...,53


In [None]:
import plotly.express as px

# Same scatter, with the marker size driven by text length (size_max=10) and
# the plain 'simple_white' template.
fig = px.scatter(
    df_embeddings,
    x='x',
    y='y',
    color='label',
    size='length_text',
    size_max=10,
    hover_data=['text'],
    labels={'color': 'label'},
    template='simple_white',
    title='GoEmotions Embedding Visualization',
)
fig.show()

In [None]:
import plotly.express as px

# Variant of the length-sized scatter with larger markers (size_max=15).
fig = px.scatter(
    df_embeddings,
    x='x',
    y='y',
    color='label',
    size='length_text',
    size_max=15,
    hover_data=['text'],
    labels={'color': 'label'},
    template='simple_white',
    title='GoEmotions Embedding Visualization',
)
fig.show()