In [4]:
import tensorflow as tf
import tensorflow_hub as hub
from transformers import BertTokenizer

In [18]:
"""Question Answering"""

def question_answer(question, reference):
    """Finds a snippet of text within a reference document to answer
        a question:
        question is a string containing the question to answer.
        reference is a string containing the reference document from which to
            find the answer.
        Returns: a string containing the answer.
        If no answer is found, return None.
        Your function should use the bert-uncased-tf2-qa model from the
            tensorflow-hub library.
        Your function should use the pre-trained BertTokenizer,
            bert-large-uncased-whole-word-masking-finetuned-squad,
            from the transformers library."""
    model = hub.load('https://tfhub.dev/see--/bert-uncased-tf2-qa/1')
    tokenizer = BertTokenizer.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')

    input_ids = tokenizer.encode(question, reference)
    input_mask = [1] * len(input_ids)
    input_type_ids = [0 if i < input_ids.index(102) else 1
                      for i in range(len(input_ids))]

    input_ids = tf.constant([input_ids])
    input_mask = tf.constant([input_mask])
    input_type_ids = tf.constant([input_type_ids])

    outputs = model([input_ids, input_mask, input_type_ids])
    start_index = tf.argmax(outputs[0][0][1:]) + 1
    end_index = tf.argmax(outputs[1][0][1:]) + 1

    answer_tokens = tokenizer.convert_ids_to_tokens(
        input_ids[0][start_index:end_index])
    answer = tokenizer.convert_tokens_to_string(answer_tokens)

    if answer == '[CLS]' or answer == '[SEP]':
        return None

    return answer

In [19]:
file_url = '/content/drive/MyDrive/Colab Notebooks/data'

In [20]:
# question_answer = __import__('0-qa').question_answer

with open('/content/drive/MyDrive/Colab Notebooks/data/ZendeskArticles/PeerLearningDays.md') as f:
    reference = f.read()

print(question_answer('When are PLDs?', reference))

on - site days from 9 : 00 am to 3 : 00


In [1]:
"""Create the loop"""

while True:
    user_input = input('Q: ')
    if user_input.lower() in ['exit', 'quit', 'goodbye', 'bye']:
        print('A: Goodbye')
        break
    else:
        print('A: ')


Q: a
A: 
Q: hdeohoe
A: 
Q: How
A: 
Q: exit
A: Goodbye


In [21]:
"""Answer Questions"""

def answer_loop(reference):
    """Answers questions from a reference text:
        reference is the reference text.
        If the answer cannot be found in the reference text,
        respond with Sorry, I do not understand your question."""
    model = hub.load('https://tfhub.dev/see--/bert-uncased-tf2-qa/1')
    tokenizer = BertTokenizer.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')

    while True:
        question = input('Q: ')
        if question.lower() in ["exit", "quit", "goodbye", "bye"]:
            print("A: Goodbye")
            break

        input_ids = tokenizer.encode(question, reference)
        input_mask = [1] * len(input_ids)
        input_type_ids = [0 if i < input_ids.index(102) else 1
                        for i in range(len(input_ids))]

        input_ids = tf.constant([input_ids])
        input_mask = tf.constant([input_mask])
        input_type_ids = tf.constant([input_type_ids])

        outputs = model([input_ids, input_mask, input_type_ids])
        start_index = tf.argmax(outputs[0][0][1:]) + 1
        end_index = tf.argmax(outputs[1][0][1:]) + 1

        answer_tokens = tokenizer.convert_ids_to_tokens(
            input_ids[0][start_index:end_index])
        answer = tokenizer.convert_tokens_to_string(answer_tokens)

        if answer:
            print('A: ' + answer)
        else:
            print('A: Sorry, I do not understand your question.')

In [65]:
# answer_loop = __import__('2-qa').answer_loop

with open('/content/drive/MyDrive/Colab Notebooks/data/ZendeskArticles/PeerLearningDays.md') as f:
    reference = f.read()

answer_loop(reference)

Q: what are mock interviews?
A: Sorry, I do not understand your question.
Q: what are plds?
A: a time for you and your peers to ensure that each of you understands the concepts you ' ve encountered in your projects , as well as a time for everyone to collectively grow in technical , professional , and soft
Q: when are plds?
A: on - site days from 9 : 00 am to 3 : 00
Q: what does PLD stand for?
A: peer learning
Q: exit
A: Goodbye


In [5]:
import numpy as np
import os
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
"""Semantic Search"""

def semantic_search(corpus_path, sentence):
    """Performs semantic search on a corpus of documents:
        corpus_path is the path to the corpus of reference documents on which
            to perform semantic search.
        sentence is the sentence from which to perform semantic search.
        Returns: the reference text of the document most similar to
            sentence."""
    model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
    embed = hub.load(model_url)

    corpus_sentences = []
    for filename in os.listdir(corpus_path):
        if filename.endswith('.md'):
            with open(os.path.join(
                    corpus_path, filename), 'r', encoding='utf-8') as file:
                document_text = file.read()
                corpus_sentences.append(document_text)

    corpus_embeddings = embed(corpus_sentences)
    query_embedding = embed([sentence])

    similarities = cosine_similarity(query_embedding, corpus_embeddings)[0]

    most_similar_index = similarities.argmax()

    most_similar_document_path = os.path.join(corpus_path, os.listdir(
         corpus_path)[most_similar_index])

    with open(most_similar_document_path, 'r', encoding='utf-8') as file:
        most_similar_document_text = file.read()

    return most_similar_document_text

In [26]:
# semantic_search = __import__('3-semantic_search').semantic_search

print(semantic_search('/content/drive/MyDrive/Colab Notebooks/data/ZendeskArticles', 'When are PLDs?'))

At the beginning of every trimester, each cohort will gather together in small groups for a 30m Roundtable conversation to kick off the trimester together. 
Objective
The Roundtables create small support groups within the cohort and gives the supporting staff an opportunity to get to know the students beyond their technical project-work.
Scheduling
The Roundtables will be defined by the first PLD groups (with adjustments for absences). The groups meetings will be planned within the first couple of weeks in the trimester. They may span a couple days if there are more groups.
 
Trimester 1
Each person will be asked to introduce themselves by:
Sharing a bit about their background
Offer a way that they can help people
Ask for a way they can be supported
Trimester 2 & 3
Each person will be asked to share:
Their experience of the past trimester
What their personal goals are for the next trimester (or future)
Whether they have new offers or asks of their group


In [27]:
"""Multi-reference Question Answering"""
def question_answer(corpus_path):
    """Answers questions from multiple reference texts:
        corpus_path is the path to the corpus of reference documents."""
    model = hub.load('https://tfhub.dev/see--/bert-uncased-tf2-qa/1')
    tokenizer = BertTokenizer.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')

    while True:
        question = input('Q: ')
        if question.lower() in ["exit", "quit", "goodbye", "bye"]:
            print("A: Goodbye")
            break

        reference = semantic_search(corpus_path, question)
        input_ids = tokenizer.encode(question, reference)
        input_mask = [1] * len(input_ids)
        input_type_ids = [0 if i < input_ids.index(102) else 1
                        for i in range(len(input_ids))]

        input_ids = tf.constant([input_ids])
        input_mask = tf.constant([input_mask])
        input_type_ids = tf.constant([input_type_ids])

        outputs = model([input_ids, input_mask, input_type_ids])
        start_index = tf.argmax(outputs[0][0][1:]) + 1
        end_index = tf.argmax(outputs[1][0][1:]) + 1

        answer_tokens = tokenizer.convert_ids_to_tokens(
            input_ids[0][start_index:end_index])
        answer = tokenizer.convert_tokens_to_string(answer_tokens)

        if answer:
            print('A: ' + answer)
        else:
            print('A: Sorry, I do not understand your question.')

In [28]:
# question_answer = __import__('4-qa').question_answer

question_answer('/content/drive/MyDrive/Colab Notebooks/data/ZendeskArticles')

Q: When are PLDs?
A: at the beginning of every trim
Q: What are Mock Interviews?
A: a mock interview has a designated length , each mock interview topic also has its own designated
Q: What does PLD stand for?
A: peer learning
Q: goodbye
A: Goodbye
