
Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

# Explore Duplicate Question Matches
Use this dashboard to explore the relationship between duplicate and original questions.

## Setup
This section loads needed packages, and defines useful functions.

In [None]:
from __future__ import print_function

import math

import ipywidgets as widgets
import pandas as pd
import requests
from azure_utils.configuration.project_configuration import ProjectConfiguration
from azure_utils.machine_learning.utils import get_workspace_from_config
from azure_utils.utilities import read_questions, text_to_json
from azureml.core.webservice import AksWebservice

from notebooks import directory

In [None]:
# Resolve the Azure ML workspace from the project configuration and echo its
# identifying details, one per line.
ws = get_workspace_from_config()
print(
    ws.name,
    ws.resource_group,
    ws.location,
    sep="\n",
)

In [None]:
# Read the AKS service name from the project settings file and build a handle
# to that web service in the workspace loaded above.
project_configuration = ProjectConfiguration("project.yml")
aks_service_name = project_configuration.get_settings('aks_service_name')
aks_service = AksWebservice(ws, name=aks_service_name)


Load the duplicate questions scoring app's URL and API key.

In [None]:
# Endpoint that scoring requests are posted to.
scoring_url = aks_service.scoring_uri
# First of the keys reported by the service; sent as the bearer token when
# scoring.
api_key = aks_service.get_keys()[0]

A constructor function for ID-text contents. Constructs buttons and text areas for each text ID and text passage.
* Each button's description is set to a text's ID, and its click action is set to the handler.
* Each text area's content is set to a text.
* A dictionary is created to map IDs to text areas.

In [None]:
def buttons_and_texts(data,
                      text_id,
                      answerid,
                      text,
                      handle_click,
                      layout=widgets.Layout(width="100%"),
                      num=15):
    """Construct buttons, text areas, and a mapping from IDs to text areas.

    Args:
        data: Data frame holding one row per text passage.
        text_id: Name of the column containing each passage's ID.
        answerid: Name of the column containing each passage's answer ID.
            When the column is absent, every button's ``answerid`` is None.
        text: Name of the column containing each passage's text.
        handle_click: Callback registered as each button's click handler.
        layout: Layout applied to every text area.
        num: Maximum number of rows of ``data`` to build widgets for.

    Returns:
        A tuple ``(items, text_map)``. ``items`` alternates button and text
        area for each row, and ``text_map`` maps each row's ID to its text
        area.
    """
    items = []
    text_map = {}
    # Column membership does not change per row, so test it once.
    has_answerid = answerid in data
    for i in range(min(num, len(data))):
        row = data.iloc[i]
        # Index by the ``text_id`` column name; the builtin ``id`` function
        # is not a column label.
        row_id = row[text_id]
        button = widgets.Button(description=row_id)
        button.answerid = row[answerid] if has_answerid else None
        button.open = False
        button.on_click(handle_click)
        items.append(button)
        text_area = widgets.Textarea(row[text],
                                     placeholder=row_id,
                                     layout=layout)
        items.append(text_area)
        text_map[row_id] = text_area
    return items, text_map

A constructor function for the duplicates and questions explorer widget. This builds a box containing duplicates and 
question tabs, each in turn containing boxes that contain the buttons and text areas.

In [None]:
def duplicates_questions_widget(duplicates_data=None,
                                questions_data=None,
                                layout=widgets.Layout(width="100%")):
    """Construct a duplicates and questions exploration widget.

    Args:
        duplicates_data: Data frame of duplicate questions. Defaults to the
            notebook-level ``duplicates`` frame.
        questions_data: Data frame of original questions. Defaults to the
            notebook-level ``questions`` frame.
        layout: Layout applied to the inner boxes and the enclosing HBox.

    Returns:
        A tuple ``(duplicates_map, questions_map, box)``: the ID-to-text-area
        mapping for each tab, and the HBox holding both tabs.
    """
    if duplicates_data is None:
        duplicates_data = duplicates
    if questions_data is None:
        questions_data = questions
    # Use the notebook's ``duplicates_text`` column name when one is defined,
    # otherwise fall back to the questions text column.
    # NOTE(review): the fallback assumes both tables share a text column
    # name -- confirm against the data files.
    duplicates_text_column = globals().get("duplicates_text", questions_text)
    # Construct the duplicates Tab of buttons and text areas.
    duplicates_items, duplicates_map_inner = buttons_and_texts(
        duplicates_data,
        duplicates_id,
        duplicates_answerid,
        duplicates_text_column,
        duplicates_click
    )
    duplicates_tab = widgets.Tab(
        [widgets.VBox(duplicates_items, layout=layout)],
        layout=widgets.Layout(width="100%", height="500px", overflow_y="auto"),
    )
    duplicates_tab.set_title(0, duplicates_title)
    # Construct the questions Tab of buttons and text areas.
    questions_items, questions_map_inner = buttons_and_texts(
        questions_data,
        questions_id,
        questions_answerid,
        questions_text,
        questions_click
    )
    questions_tab = widgets.Tab(
        [widgets.VBox(questions_items, layout=layout)],
        layout=widgets.Layout(width="100%", height="500px", overflow_y="auto"),
    )
    questions_tab.set_title(0, questions_title)
    # Put both tabs in an HBox.
    duplicates_questions_inner = widgets.HBox([duplicates_tab, questions_tab], layout=layout)
    return duplicates_map_inner, questions_map_inner, duplicates_questions_inner

A handler function for a question passage button press. If the passage's text window is open, it is collapsed. 
Otherwise, it is opened.

In [None]:
def questions_click(button):
    """Toggle the text area belonging to the clicked question button.

    An open text area is collapsed (``rows`` reset to None); a closed one is
    expanded to 10 rows. The button's ``open`` flag is flipped to match.
    """
    global questions_map
    was_open = button.open
    text_area = questions_map[button.description]
    text_area.rows = None if was_open else 10
    button.open = not was_open

A handler function for a duplicate button press. If the duplicate is not selected, select it and update 
the questions tab with its top 15 question passages ordered by match score. Otherwise, if the duplicate's text window 
is open, it is collapsed, else it is opened.

In [None]:
def duplicates_click(button):
    """Handle a click on a duplicate button.

    A click that changes the selected duplicate always opens its text area.
    A click on the already-selected duplicate toggles the text area between
    collapsed (``rows`` None) and expanded (10 rows).
    """
    global duplicates_map
    if select_duplicate(button):
        should_open = True
    else:
        should_open = not button.open
    duplicates_map[button.description].rows = 10 if should_open else None
    button.open = should_open


def select_duplicate(button):
    """Make the clicked duplicate the selected one and refresh the questions.

    When the button is not the current selection, the previous selection's
    highlight is cleared, the new button is highlighted, its text is scored,
    and the questions tab is rebuilt from the scored questions and retitled
    with the duplicate's ID.

    Returns:
        True if the selected duplicate changed, False otherwise.
    """
    global selected_button, questions_map, duplicates_questions
    had_selection = "selected_button" in globals()
    if had_selection and button == selected_button:
        return False

    # Move the highlight from the previous selection to the new one.
    if had_selection:
        selected_button.style.button_color = None
        selected_button.style.font_weight = ""
    selected_button = button
    selected_button.style.button_color = "yellow"
    selected_button.style.font_weight = "bold"

    # Score the duplicate's text and order the questions by the result.
    selected_text = duplicates_map[selected_button.description].value
    questions_scores = score_text(selected_text)
    ordered_questions = questions.loc[questions_scores[questions_id]]
    questions_items, questions_map = buttons_and_texts(
        ordered_questions,
        questions_id,
        questions_answerid,
        questions_text,
        questions_click
    )

    # Items alternate button, text area; the even slots are the buttons.
    question_buttons = questions_items[::2]
    if questions_button_color is True and selected_button.answerid is not None:
        set_button_color(question_buttons, selected_button.answerid)
    if questions_button_score is True:
        scored_items = []
        for question_button, text_area in zip(question_buttons,
                                              questions_items[1::2]):
            scored_items.append(
                add_button_prob(question_button, questions_scores))
            scored_items.append(text_area)
        questions_items = scored_items

    # Swap the new items into the questions tab and retitle it.
    questions_tab = duplicates_questions.children[1]
    questions_tab.children[0].children = questions_items
    questions_tab.set_title(0, selected_button.description)
    return True


def add_button_prob(button, questions_scores):
    """Pair a question button with a label showing its scaled score.

    The score is looked up in ``questions_scores`` by the button's
    description, multiplied by ``score_scale``, and rounded up to an integer.

    Returns:
        An HBox containing the button followed by the score label.
    """
    probability = questions_scores.loc[button.description][questions_probability]
    scaled_score = int(math.ceil(score_scale * probability))
    prob = widgets.Label(score_label + ": " + str(scaled_score))
    return widgets.HBox([button, prob])


def set_button_color(button, answerid):
    """Highlight the buttons whose answer ID matches ``answerid``.

    Args:
        button: Iterable of buttons, each with ``answerid`` and ``style``.
        answerid: Answer ID to compare each button against.

    Matching buttons are colored "lightgreen"; all others have their color
    reset to None.
    """
    for a_button in button:
        if a_button.answerid == answerid:
            color = "lightgreen"
        else:
            color = None
        a_button.style.button_color = color

Functions for interacting with the web service.

In [None]:
def score_text(text):
    """Return a data frame with the original question scores for the text.

    Posts the text, converted to JSON, to the scoring service using the API
    key as a bearer token, and loads the JSON response into a data frame.

    Args:
        text: Text of the duplicate question to score.

    Returns:
        A data frame with the question ID, answer ID and probability columns,
        indexed by question ID. The ID and answer ID columns are cast to str.

    Raises:
        requests.HTTPError: If the service responds with an error status.
    """
    headers = {
        "content-type": "application/json",
        "Authorization": ("Bearer " + api_key),
    }
    jsontext = text_to_json(text)
    result = requests.post(scoring_url, data=jsontext, headers=headers)
    # Surface authentication and server errors here rather than as a
    # confusing failure while parsing an error payload below.
    result.raise_for_status()
    scores = result.json()
    scores_df = pd.DataFrame(
        scores,
        columns=[questions_id, questions_answerid, questions_probability])
    scores_df[questions_id] = scores_df[questions_id].astype(str)
    scores_df[questions_answerid] = scores_df[questions_answerid].astype(str)
    # Keep the ID as a column as well as the index.
    scores_df = scores_df.set_index(questions_id, drop=False)
    return scores_df

Control the appearance of cell output boxes.

In [None]:
%%html
<style>
/* Let output areas size to their content, up to a fixed maximum height. */
.output_wrapper, .output {
    height:auto !important;
    max-height:1000px;  /* your desired max-height here */
}
/* Remove the inset shadow drawn around scrolling output areas. */
.output_scroll {
    box-shadow:none !important;
    -webkit-box-shadow:none !important;  /* vendor prefix needs the leading hyphen */
}
</style>

## Load data

Load the pre-formatted text of questions.

In [None]:
# Tab title shown above the questions.
questions_title = "Questions"

# Column names used for the questions table and the scoring results.
questions_id = "Id"
questions_answerid = "AnswerId"
questions_text = "Text"
questions_probability = "Probability"

# Load the questions table from the project's data folder.
questions_path = directory + "/data_folder/questions.tsv"
questions = read_questions(
    questions_path,
    questions_id,
    questions_answerid,
)

Load the pre-formatted text of duplicates.

In [None]:
# Tab title shown above the duplicates.
duplicates_title = 'Duplicates'
# Column names used for the duplicates table.
duplicates_id = 'Id'
duplicates_answerid = 'AnswerId'
# Text column for the duplicates tab, mirroring `questions_text`.
# NOTE(review): assumes the duplicates file uses the same 'Text' column as
# the questions file -- confirm against dupes_test.tsv.
duplicates_text = 'Text'
# Load the duplicates table from the project's data folder.
duplicates_path = directory + '/data_folder/dupes_test.tsv'
duplicates = read_questions(duplicates_path, duplicates_id, duplicates_answerid)

## Explore original questions matched up with duplicate questions

Define other variables and settings used in creating the interface.

In [None]:
# Number of questions to display.
questions_display = 15

# Whether to color question buttons by answer match, and whether to show
# each question's score next to its button.
questions_button_color = True
questions_button_score = True

# Label text and multiplier used when rendering a score.
score_label = "Score"
score_scale = 100

This builds the exploration widget as a box containing duplicates and question tabs, each in turn containing boxes 
that have for each ID-text pair a button and a text area.

In [None]:
# Build the explorer and keep the two ID-to-text-area maps that the click
# handlers read.
(
    duplicates_map,
    questions_map,
    duplicates_questions,
) = duplicates_questions_widget(duplicates, questions)
# Leave the widget as the cell's last expression so the notebook renders it.
duplicates_questions

To tear down the cluster and related resources go to the [last notebook](08_TearDown.ipynb).