In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE

import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly.express as px

# Load the models

- SBERT (For Sentence-based vectors)
- BERT

In [2]:
# Load the pre-trained model (SBERT)
# The checkpoint is selected by name; `model` is used further down to turn
# utterances into sentence embeddings via `model.encode(...)`.
# NOTE(review): presumably the weights are fetched on first use and cached
# afterwards - confirm for the execution environment.
model_name = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(model_name)

# Load the data

Utterances: For utterance-based analysis

Dialog Act labels: For dialog act analysis

In [3]:
# Load the utterances and dialog act labels

def load_utterances(path='data_loaders/dailydialog/dialogues_text.txt'):
    """Read the dialogue text file into a dict of utterance lists.

    Each line of the file is one dialogue whose utterances are separated by
    the ``__eou__`` marker. Typographic apostrophes (’) are replaced with
    plain ASCII apostrophes before the line is split.

    Args:
        path: Location of the dialogue text file. Defaults to the path used
            by this notebook, so existing ``load_utterances()`` calls keep
            working unchanged.

    Returns:
        dict mapping the zero-based line index to the list of utterance
        strings of that dialogue. Whitespace around each utterance is kept
        exactly as it appears in the file.
    """
    utterances_dict = {}

    with open(path, encoding='utf-8') as f:
        for index, line in enumerate(f):
            # Remove the line terminator first so it can never end up inside
            # a kept utterance.
            text = line.rstrip('\n').replace('’', "'")
            utterances = text.split('__eou__')

            # A well-formed line ends with '__eou__', which leaves an empty
            # (or whitespace-only) tail after the split. Drop the tail only
            # in that case: popping unconditionally would throw away the real
            # last utterance of a line that lacks the trailing marker.
            if not utterances[-1].strip():
                utterances.pop()

            utterances_dict[index] = utterances

    return utterances_dict
    
def load_labels(path='data_loaders/dailydialog/dialogues_act.txt'):
    """Read the dialog-act file into a dict of label lists.

    Each line of the file holds the whitespace-separated dialog-act labels
    of one dialogue.

    Args:
        path: Location of the dialog-act file. Defaults to the path used by
            this notebook, so existing ``load_labels()`` calls keep working
            unchanged.

    Returns:
        dict mapping the zero-based line index to the list of label strings
        of that dialogue. A blank line yields an empty list.
    """
    labels_dict = {}

    # Explicit encoding keeps this consistent with load_utterances and
    # independent of the platform default.
    with open(path, encoding='utf-8') as f:
        for index, line in enumerate(f):
            # split() without an argument collapses runs of whitespace and
            # ignores leading/trailing whitespace, so no empty-string labels
            # are produced for blank lines or doubled spaces.
            labels_dict[index] = line.split()

    return labels_dict

# Parse both files once. Despite its name, `utterances_list` is a dict
# (line index -> list of utterance strings); `dialog_labels` is keyed the
# same way (line index -> list of label strings).
utterances_list = load_utterances()
dialog_labels = load_labels()

# Utterance Features visualization

- **Input**: Sentence
- **Output**: Feature Vector
- **Visualize**: t-SNE (initialized with PCA) + Scatter Plot

In [4]:
# Convert to Dataframe

# Flatten the per-dialogue structures into two parallel lists: one entry per
# utterance, with its dialog-act label at the same position.
utterances = []
da_labels = []
skipped_dialogues = []

for key, utter_list in utterances_list.items():

    # Get the DA labels of the same dialogue
    da_label_list = dialog_labels[key]

    # The two lists are only meaningful while they stay position-aligned.
    # A dialogue whose utterance and label counts differ would shift every
    # later (utterance, label) pair, so it is left out and reported instead.
    if len(utter_list) != len(da_label_list):
        skipped_dialogues.append(key)
        continue

    utterances.extend(utter_list)
    da_labels.extend(da_label_list)

if skipped_dialogues:
    print(
        f'Skipped {len(skipped_dialogues)} dialogue(s) with mismatched '
        f'utterance/label counts (first few: {skipped_dialogues[:10]})'
    )

# Then store in a dictionary before conversion
data_extraction = {
    'utterance': utterances,
    'dialog_act': da_labels
}

dialog_dataframe = pd.DataFrame.from_dict(data_extraction)

In [5]:
# Trial run: work on the first N utterances only
N = 10000

# Slice both parallel lists identically so every sentence keeps its label
sentences, act_labels = utterances[:N], da_labels[:N]
print(len(sentences))

# Encode the selected sentences into embeddings with the loaded model
sentence_embeddings = model.encode(sentences, show_progress_bar=True)

10000


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

In [6]:
# Two-dimensional t-SNE projection of the sentence embeddings, configured
# with PCA initialisation and a fixed random seed.
tsne = TSNE(
    n_components=2,
    init='pca',
    learning_rate=200,
    random_state=42,
)

# Fit the projector and keep the projected coordinates for plotting
tsne_embeddings = tsne.fit_transform(sentence_embeddings)



In [7]:
# Marker colour for each dialog-act label (labels are the string codes
# read from the label file)
color_conversion = {
    '1': '#c2622f',
    '2': '#5482cc',
    '3': '#72bd35',
    '4': '#8035bd',
}

# Human-readable name for each dialog-act label
label_mapping = {
    '1': 'inform',
    '2': 'question',
    '3': 'directive',
    '4': 'commissive',
}

# Translate every selected label into its colour and its readable name
colors = list(map(color_conversion.__getitem__, act_labels))
full_mapping = list(map(label_mapping.__getitem__, act_labels))

dataframe = pd.DataFrame({'da_label': full_mapping})

In [8]:
# Traces of the figure: first one placeholder trace per dialog act (a single
# None point each) that only contributes a legend entry, then the real
# scatter trace.
data = []

for num, color in color_conversion.items():
    label = label_mapping[num]
    data.append(
        go.Scatter(
            name=label,
            mode="markers",
            x=[None],
            y=[None],
            marker={'size': 7, 'color': color, 'symbol': 'circle'},
        )
    )

# All projected sentences in one trace, coloured per point by dialog act.
# Its own legend entry is hidden because the placeholder traces above
# already provide one per dialog act.
scatter_trace = go.Scatter(
    x=tsne_embeddings[:, 0],
    y=tsne_embeddings[:, 1],
    mode='markers',
    name='',
    marker={'size': 10, 'color': colors},
    text=sentences,
    customdata=np.stack(dataframe['da_label'], axis=-1),
    hovertemplate='Text: %{text}<br>Dialog Act: %{customdata}<extra></extra>',
    showlegend=False,
)

data.append(scatter_trace)

In [11]:
# Figure layout: both axes are hidden (no grid, not visible) and hovering
# picks the closest point.
layout = {
    'title': 'SBERT Visualization',
    'xaxis': {'title': 'Dimension 1', 'showgrid': False, 'visible': False},
    'yaxis': {'title': 'Dimension 2', 'showgrid': False, 'visible': False},
    'hovermode': 'closest',
}

In [12]:
# Assemble the figure from the traces and layout, apply the
# 'simple_white' template, then render it.
fig = go.Figure(data=data, layout=layout).update_layout(template='simple_white')
fig.show()

# Find the context window

1. Load the dialog act labels &check;
2. Get the length of every conversation
3. Plot the visuals

Visual information can include:
- Histogram

In [None]:
# Number of dialog-act labels in each conversation
conversation_length = list(map(len, dialog_labels.values()))

In [None]:
# Histogram of the raw conversation lengths, with labelled axes and a
# custom hover text
fig = (
    px.histogram(conversation_length)
    .update_layout(
        xaxis_title="Conversation Length",
        yaxis_title="Counts",
        hovermode='x',
        bargap=0.5,
    )
    .update_traces(hovertemplate='Conversation length of %{x} has %{y} occurrences')
)

fig.show()

In [None]:
# create the bins
# Bins are 5 wide and start at 0. The number of bins follows the longest
# conversation so that no length falls outside the last edge and gets
# silently dropped by np.histogram (for a maximum length of 35 or less this
# yields the edges 0, 5, ..., 35).
bin_width = 5
longest_conversation = max(conversation_length, default=0)
n_bins = max(1, int(np.ceil(longest_conversation / bin_width)))
bin_edges = np.arange(n_bins + 1) * bin_width

counts, bin_edges = np.histogram(conversation_length, bins=bin_edges)

# Centre of each bin, used as the x position of its bar
bins = 0.5 * (bin_edges[:-1] + bin_edges[1:])

# The counts are already aggregated, so draw them as bars. Passing them to
# px.histogram would bin the bin centres a second time.
fig = px.bar(x=bins, y=counts)

fig.update_layout(
    bargap=0.5, hovermode = 'x',
    xaxis_title="Conversation Length (Range)",
    yaxis_title="Counts"
)

fig.show()