In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE

import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly.express as px

In [2]:
# Shared lookup tables used throughout the notebook

# Maps the dialog-act codes (kept as strings, '1'-'4') to readable
# dialog-act names. The cells below use it to translate each label
# before printing or plotting it.
label_mapping = {
    '1': 'inform', 
    '2': 'question', 
    '3': 'directive', 
    '4': 'commissive'
}

# Load the data

Utterances: For utterance-based analysis

Dialog Act labels: For dialog act analysis

In [3]:
# Load the utterances and dialog act labels

def load_utterances(path='data_loaders/dailydialog/dialogues_text.txt'):
    """Read a dialogue text file into per-conversation utterance lists.

    Every line of the file is treated as one conversation whose utterances
    are separated by the ``__eou__`` marker. Typographic apostrophes (’)
    are replaced with plain ASCII ones. Whitespace around each utterance
    is kept exactly as it appears in the file.

    Args:
        path: Location of the text file. Defaults to the path the notebook
            has always used, so existing calls without arguments still work.

    Returns:
        dict mapping the 0-based line index to the list of utterance
        strings found on that line.
    """
    utterances_dict = {}

    with open(path, encoding='utf-8') as f:
        for index, line in enumerate(f):
            utterances = line.replace('’', "'").split('__eou__')

            # Whatever follows the final marker is normally just the line
            # break. Drop it only when it is blank, so a line that lacks a
            # closing marker does not lose its last real utterance.
            if not utterances[-1].strip():
                utterances.pop()

            utterances_dict[index] = utterances

    return utterances_dict
    
def load_labels(path='data_loaders/dailydialog/dialogues_act.txt'):
    """Read a dialog-act label file into per-conversation label lists.

    Every line of the file is treated as one conversation, holding one
    whitespace-separated label token per utterance.

    Args:
        path: Location of the label file. Defaults to the path the notebook
            has always used, so existing calls without arguments still work.

    Returns:
        dict mapping the 0-based line index to the list of label strings
        found on that line. A blank line yields an empty list.
    """
    labels_dict = {}

    with open(path, encoding='utf-8') as f:
        for index, line in enumerate(f):
            # split() with no separator collapses runs of whitespace and
            # ignores leading/trailing whitespace, so no empty-string
            # "labels" are produced for blank lines or double spaces.
            labels_dict[index] = line.split()

    return labels_dict

# Both loaders return dicts keyed by the 0-based line index of a
# conversation, so the same key addresses one conversation in either dict.
# NOTE(review): despite its name, `utterances_list` is a dict, not a list.
utterances_list = load_utterances()
dialog_labels = load_labels()

# Case Study

- A question reflected as either question-act or inform-act
- 'Thanks .' reflected as either inform or commissive

In [13]:
def display_full_conversation(utterance_list, dialog_labels, label_mapping):
    """Print one conversation, turn by turn, with readable dialog-act names.

    Args:
        utterance_list: Utterance strings of the conversation, in order.
        dialog_labels: Label codes, one per utterance, in the same order.
        label_mapping: Lookup from label code to the name that is printed.

    Returns:
        None. Everything is written to standard output.
    """
    # The header reports the number of utterances, even if fewer labels
    # are supplied (zip below stops at the shorter of the two).
    print(f'Conversation length: {len(utterance_list)}')

    turns = zip(utterance_list, dialog_labels)
    for utterance, act_code in turns:
        dialog_label = label_mapping[act_code]
        print(f'{utterance} - (label: {dialog_label})')

    # print() adds its own line break, so this leaves two after the block
    print("\n")

In [16]:
# Case study: an utterance can read as a question without being
# labelled as one. Count those utterances and show their conversations.
misleading_question_labels = 0

for index, utterance_list in utterances_list.items():

    labels_list = dialog_labels[index]

    for dialog_label, utterance in zip(labels_list, utterance_list):
        dialog_label = label_mapping[dialog_label]

        # An utterance "implies a question" when its last visible
        # character is '?'. Trailing whitespace is stripped first so the
        # check does not depend on how many padding characters follow the
        # '?', and cannot raise IndexError on very short utterances.
        if utterance.rstrip().endswith('?') and dialog_label != 'question':
            misleading_question_labels += 1

            # The whole conversation is printed once per matching
            # utterance, so it may appear more than once in the output.
            print(f'Index number: {index}')
            display_full_conversation(utterance_list, labels_list, label_mapping)

Index number: 1
Conversation length: 5
So Dick , how about getting some coffee for tonight ?  - (label: directive)
 Coffee ? I don ' t honestly like that kind of stuff .  - (label: commissive)
 Come on , you can at least try a little , besides your cigarette .  - (label: directive)
 What ' s wrong with that ? Cigarette is the thing I go crazy for .  - (label: inform)
 Not for me , Dick .  - (label: inform)


Index number: 3
Conversation length: 4
Would you mind waiting a while ?  - (label: directive)
 Well , how long will it be ?  - (label: question)
 I'm not sure . But I'll get a table ready as fast as I can .  - (label: inform)
 OK . We'll wait .  - (label: inform)


Index number: 5
Conversation length: 9
Isn ' t he the best instructor ? I think he ' s so hot . Wow ! I really feel energized , don ' t you ?  - (label: question)
 I swear , I ' m going to kill you for this .  - (label: inform)
 What ' s wrong ? Didn ' t you think it was fun ? !  - (label: question)
 Oh , yeah ! I had a 

In [17]:
# Case study: a bare 'Thanks .' can carry different dialog acts depending
# on context. Count those utterances and show their conversations.
context_thanks_labels = 0

for index, utterance_list in utterances_list.items():

    labels_list = dialog_labels[index]

    for dialog_label, utterance in zip(labels_list, utterance_list):
        dialog_label = label_mapping[dialog_label]

        # Find utterances that consist of nothing but 'Thanks .'.
        # Surrounding whitespace is stripped before comparing because the
        # first utterance of a conversation has no leading space, while
        # later ones do; an exact match on ' Thanks . ' would miss it.
        if utterance.strip() == 'Thanks .':
            context_thanks_labels += 1

            print(f'Index number: {index}')
            display_full_conversation(utterance_list, labels_list, label_mapping)

Index number: 12
Conversation length: 4
Can I help you sir , what do you need ?  - (label: question)
 I need a packet of cigarettes please .  - (label: directive)
 Of course sir , no problem .  - (label: commissive)
 Thanks .  - (label: inform)


Index number: 41
Conversation length: 8
May I help you ?  - (label: question)
 Give me six-piece chicken nuggets , a large fries , and a large coke .  - (label: directive)
 You will need to wait a few minutes for fries . They're still in the fryer .  - (label: commissive)
 That's fine .  - (label: inform)
 Your total comes to 7 dollars .  - (label: inform)
 Here's 20 .  - (label: inform)
 Thank you . Your cash back is 13 dollars . Pull into a parking states , and we'll bring you your fries in two minutes .  - (label: directive)
 Thanks .  - (label: commissive)


Index number: 249
Conversation length: 8
Do you carrying fresh fish ?  - (label: question)
 Yes we do . Check the meat department . There is a fish counter there too .  - (label: infor

In [18]:
# Report the totals accumulated by the two case-study loops above
print(f'Misleading labels for questions occurrences: {misleading_question_labels}')
print(f'Misleading labels for thankful occurrences: {context_thanks_labels}')

Misleading labels for questions occurrences: 6108
Misleading labels for thankful occurrences: 127


# Utterance Features visualization

- **Input**: Sentence
- **Output**: Feature Vector
- **Visualize**: T-SNE + PCA + Scatter Plot

Did you know:

- Encoding all sentences takes roughly 36 minutes, so save the resulting embeddings!

In [None]:
# Load the pre-trained model (SBERT)
# The checkpoint is selected by name and handed to SentenceTransformer.
model_name = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(model_name)

# Load the models

- SBERT (For Sentence-based vectors)
- BERT

In [None]:
# Flatten the per-conversation dicts into two parallel lists
# (one entry per utterance) and wrap them in a DataFrame.

utterances, da_labels = [], []

for conv_id, conv_utterances in utterances_list.items():
    # Labels are looked up first, under the same conversation index
    conv_labels = dialog_labels[conv_id]

    utterances.extend(conv_utterances)
    da_labels.extend(conv_labels)

# Column name -> column values, in the order the columns should appear
data_extraction = {'utterance': utterances, 'dialog_act': da_labels}

dialog_dataframe = pd.DataFrame(data_extraction)

In [None]:
# Extract N sentences (Trial run)
#N = 10000

# The trial-run limit above is commented out, so every utterance is used.
# These are aliases of the existing lists, not copies.
sentences = utterances
act_labels = da_labels
print(len(sentences))

# Convert to sentence embeddings
# NOTE(review): the notebook text says this step takes roughly 36 minutes,
# yet the result is never written to disk; consider caching
# `sentence_embeddings` so a kernel restart does not recompute it.
sentence_embeddings = model.encode(sentences, show_progress_bar=True)

In [None]:
# Project the sentence embeddings down to 2 components with t-SNE.
# random_state is pinned so repeated runs start from the same seed, and
# the embedding is initialised from PCA (init='pca').
tsne = TSNE(
    n_components=2, random_state=42, 
    init='pca',
    learning_rate=200
)


# Fit on all sentence embeddings and keep the 2-component coordinates
tsne_embeddings = tsne.fit_transform(sentence_embeddings)

In [None]:
# Marker colour (hex) for each dialog-act code
color_conversion = {
    '1': '#c2622f',
    '2': '#5482cc',
    '3': '#72bd35',
    '4': '#8035bd'
}

# Walk the labels once, collecting the colour and the readable name for
# every sentence; both lists stay in the same order as act_labels.
colors, full_mapping = [], []
for act_code in act_labels:
    colors.append(color_conversion[act_code])
    full_mapping.append(label_mapping[act_code])

# Single-column frame holding the readable dialog-act name per sentence
dataframe = pd.DataFrame({'da_label': full_mapping})

In [None]:
# Placeholder traces with no real points: each one exists only to give a
# dialog act its own coloured entry in the legend.
legend_traces = [
    go.Scatter(
        x=[None],
        y=[None],
        mode="markers",
        name=label_mapping[act_code],
        marker=dict(size=7, color=act_color, symbol='circle'),
    )
    for act_code, act_color in color_conversion.items()
]

# The real scatter: one marker per sentence at its t-SNE coordinates,
# coloured by dialog act. It is hidden from the legend because the
# placeholder traces already provide the legend entries.
scatter_trace = go.Scatter(
    x=tsne_embeddings[:, 0],
    y=tsne_embeddings[:, 1],
    text=sentences,
    # Stacking the column's values gives one label name per point,
    # which the hover template reads as %{customdata}.
    customdata=np.stack(dataframe['da_label'], axis=-1),
    hovertemplate='Text: %{text}<br>Dialog Act: %{customdata}<extra></extra>',
    mode='markers',
    marker=dict(size=10, color=colors),
    name='',
    showlegend=False,
)

# Legend placeholders first, then the actual data trace
data = legend_traces + [scatter_trace]

In [None]:
# Figure layout: both axes keep a title but are hidden and drawn
# without grid lines; hovering picks the closest point.
layout = {
    'title': 'SBERT Visualization',
    'xaxis': {'title': 'Dimension 1', 'showgrid': False, 'visible': False},
    'yaxis': {'title': 'Dimension 2', 'showgrid': False, 'visible': False},
    'hovermode': 'closest',
}

In [None]:
# Assemble the figure from the traces and layout built in the cells above,
# switch it to the 'simple_white' template, and render it.
fig = go.Figure(data=data, layout=layout)
fig.update_layout(template='simple_white')
fig.show()

In [None]:
# Export the figure as an HTML file; the path is relative, so it lands in
# the current working directory.
fig.write_html('full_tsne_dailydialog.html')

# Find the context window

1. Load the dialog act labels &check;
2. Get the length of every conversation
3. Plot the visuals

Visual information can include:
- Histogram

In [None]:
# Number of labels (i.e. turns) in each conversation; the dict keys are
# not needed, so only the label lists are visited.
conversation_length = [len(labels) for labels in dialog_labels.values()]

In [None]:
# Histogram of the raw per-conversation lengths
fig = px.histogram(conversation_length)

fig.update_layout(
    bargap=0.5, hovermode = 'x',
    xaxis_title="Conversation Length",
    yaxis_title="Counts"
)

# Custom hover text built from the bar's length (x) and its count (y)
fig.update_traces(hovertemplate='Conversation length of %{x} has %{y} occurrences')

fig.show()

In [None]:
# Group the conversation lengths into fixed-width ranges and plot the
# number of conversations that fall into each range.
BIN_WIDTH = 5

# np.histogram ignores values outside the outermost bin edges, so the last
# edge is grown to cover the longest conversation. It never drops below 35,
# the upper edge this cell used before, so shorter data bins as it did.
longest = max(conversation_length, default=0)
last_edge = max(35, -(-longest // BIN_WIDTH) * BIN_WIDTH)  # round up to a multiple of BIN_WIDTH
bin_edges = np.arange(0, last_edge + BIN_WIDTH, BIN_WIDTH)

counts, bin_edges = np.histogram(conversation_length, bins=bin_edges)

# Place every bar at the midpoint of its range
bins = 0.5 * (bin_edges[:-1] + bin_edges[1:])

# The counts are already aggregated, so they are drawn directly as bars;
# px.histogram would run its own binning over the midpoints again.
fig = px.bar(x=bins, y=counts)

fig.update_layout(
    bargap=0.5, hovermode = 'x',
    xaxis_title="Conversation Length (Range)",
    yaxis_title="Counts"
)

fig.show()