In [7]:
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE

import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly.express as px

# Load the models

- SBERT (For Sentence-based vectors)
- BERT

In [2]:
# Load the pre-trained model (SBERT)
model_name = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(model_name)

# Load the data

Utteraces: For utterance-based analysis

Dialog Act labels: For dialog act analysis

In [3]:
# Load the utterances and dialog act labels

def load_utterances():
    
    utterances_dict = {}
    
    with open('data_loaders/dailydialog/dialogues_text.txt', encoding='utf-8') as f:
        
        for index, line in enumerate(f):
            
            utterances = line.replace('â€™',"'").split('__eou__')
            utterances.pop(-1)
            utterances_dict[index] = utterances
    
    return utterances_dict
    
def load_labels():
    labels_dict = {}
    
    with open('data_loaders/dailydialog/dialogues_act.txt') as f:
        for index, line in enumerate(f):
            labels_dict[index] = line.strip().split(' ')
    
    return labels_dict

utterances_list = load_utterances()
dialog_labels = load_labels()

# Utterance Features visualization

- **Input**: Sentence
- **Output**: Feature Vector
- **Visualize**: T-SNE + PCA + Scatter Plot

In [4]:
# Convert to Dataframe

# First extract all the sentences
# and the corresponding labels
utterances = []
da_labels = []

for key, utter_list in utterances_list.items():
    
    # Get the DA labels
    da_label_list = dialog_labels[key]
    
    utterances += [utterance for utterance in utter_list]
    da_labels += [label for label in da_label_list]

# Then store in a dictionary before conversion
data_extraction = {
    'utterance': utterances,
    'dialog_act': da_labels
}

dialog_dataframe = pd.DataFrame.from_dict(data_extraction)

In [None]:
def convert_sentences(sentence):
    return model.encode(sentences)

dialog_dataframe['feature_vectors'] = dialog_dataframe['utterance'].map(lambda x: model.encode(x))

print(dialog_dataframe)

In [None]:
perplexity = 2

tsne = TSNE(
    n_components=2, random_state=42, 
    perplexity = perplexity, init='pca',
    learning_rate=200
)


tsne_embeddings = tsne.fit_transform(sentence_embeddings)

In [None]:
scatter_trace = go.Scatter(
    x = tsne_embeddings[:, 0],
    y = tsne_embeddings[:, 1],
    mode = 'markers',
    marker = dict(
        size = 10,
        color = ['red', 'green', 'blue']
    ),
    text = sentences
)

data = [scatter_trace]

In [None]:
layout = dict(
    title = 'SBERT Visualization',
    xaxis = dict(title='Dimesion 1'),
    yaxis = dict(title='Dimension 2'),
    hovermode = 'closest'
)

In [None]:
fig = go.Figure(data=data, layout=layout)
fig.show()

# Find the context window

1. Load the dialog act labels &check;
2. Get the length of every conversation
3. Plot the visuals

Visual information can include:
- Histogram

In [16]:
conversation_length = [len(dialog_list) for key, dialog_list in dialog_labels.items()]
print(conversation_length)

[2, 5, 4, 4, 3, 9, 4, 9, 2, 5, 4, 6, 4, 6, 2, 6, 6, 4, 4, 7, 2, 4, 8, 8, 4, 4, 4, 4, 4, 2, 2, 2, 2, 6, 8, 8, 16, 4, 2, 4, 8, 8, 8, 2, 4, 6, 2, 10, 6, 4, 2, 4, 2, 2, 2, 4, 8, 4, 5, 4, 8, 2, 5, 2, 8, 2, 6, 10, 5, 2, 6, 5, 2, 8, 2, 4, 2, 6, 4, 2, 10, 4, 2, 4, 2, 9, 2, 4, 2, 4, 2, 2, 4, 2, 2, 7, 2, 8, 4, 8, 2, 4, 4, 2, 2, 4, 4, 4, 4, 4, 2, 4, 4, 2, 2, 4, 4, 2, 2, 2, 2, 4, 2, 4, 2, 4, 8, 2, 4, 4, 16, 4, 2, 4, 4, 2, 4, 2, 4, 4, 4, 4, 2, 2, 2, 10, 2, 4, 4, 4, 6, 2, 9, 2, 2, 2, 2, 2, 2, 2, 7, 4, 2, 6, 4, 6, 4, 16, 2, 2, 10, 10, 2, 2, 10, 16, 5, 5, 11, 11, 2, 2, 2, 2, 2, 10, 5, 2, 10, 6, 2, 5, 5, 5, 7, 5, 2, 6, 5, 5, 2, 6, 5, 2, 2, 6, 5, 6, 2, 11, 2, 2, 9, 2, 8, 9, 2, 2, 2, 2, 5, 8, 5, 2, 7, 6, 9, 2, 2, 2, 10, 6, 6, 2, 5, 5, 7, 6, 8, 10, 2, 6, 5, 6, 6, 12, 7, 9, 6, 8, 10, 2, 2, 4, 2, 4, 4, 4, 4, 4, 2, 6, 4, 2, 7, 9, 11, 9, 10, 10, 7, 11, 6, 10, 7, 17, 2, 7, 2, 13, 2, 12, 10, 12, 10, 9, 8, 3, 4, 8, 8, 6, 8, 13, 12, 2, 3, 8, 3, 8, 8, 12, 7, 4, 2, 8, 8, 8, 9, 8, 4, 8, 8, 8, 8, 11, 12, 11, 13, 17, 

In [23]:
fig = px.histogram(conversation_length)

fig.update_layout(
    bargap=0.5, hovermode = 'x',
    xaxis_title="Conversation Length",
    yaxis_title="Counts"
)

fig.update_traces(hovertemplate='Conversation length of %{x} has %{y} occurrences')

fig.show()

In [20]:
# create the bins
counts, bins = np.histogram(conversation_length, bins=range(0, 40, 5))
bins = 0.5 * (bins[:-1] + bins[1:])

fig = px.histogram(x=bins, y=counts)

fig.update_layout(
    bargap=0.5, hovermode = 'x',
    xaxis_title="Conversation Length (Range)",
    yaxis_title="Counts"
)

fig.show()