In [1]:
import numpy as np
import re
import requests
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BertTokenizer, BertModel
import torch
import spacy
import nltk
from nltk.corpus import stopwords
import time
from nltk.tokenize import word_tokenize
from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration
from nltk.corpus import wordnet as wn

In [2]:
# Fetch the NLTK resources needed for tokenisation and WordNet lookups.
for nltk_resource in ('punkt', 'wordnet'):
    nltk.download(nltk_resource)

# spaCy pipeline used for part-of-speech tagging and lemmatisation.
nlp = spacy.load("en_core_web_sm")

# English stop words, held in a set for membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Mount Google Drive at /content/drive (uses the google.colab helper, so this cell
# only works inside Colab). Later cells read and write files under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
def get_pos_weights(word_pos):
    """Return the keyword-priority weight for a part-of-speech tag.

    Nouns and proper nouns weigh 2.0, verbs 1.0, adjectives 0.5;
    any other tag weighs 0.0.
    """
    weight_table = (
        (("NOUN", "PROPN"), 2.0),
        (("VERB",), 1.0),
        (("ADJ",), 0.5),
    )
    for tags, weight in weight_table:
        if word_pos in tags:
            return weight
    return 0.0

In [6]:
def get_relatedness(word1, word2, timeout=30):
    """Return the ConceptNet relatedness score between two English terms.

    Args:
        word1: first term (plain text, not a ConceptNet URI).
        word2: second term.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        The "value" field of the API response. If the request fails, times
        out, returns a non-200 status, returns invalid JSON or carries no
        "value", the pair is printed (as before) and 0 is returned so that
        callers can always compare the result numerically.
    """
    def _concept_uri(word):
        # ConceptNet term URIs are lower-case with underscores in place of spaces.
        return "/c/en/" + word.strip().lower().replace(" ", "_")

    try:
        # Passing the nodes via `params` lets requests URL-encode them, so terms
        # containing characters such as '&', '#' or '?' cannot corrupt the query.
        response = requests.get(
            "http://api.conceptnet.io/relatedness",
            params={"node1": _concept_uri(word1), "node2": _concept_uri(word2)},
            timeout=timeout,
        )
        if response.status_code == 200:
            relatedness = response.json().get("value")
            if relatedness is not None:
                return relatedness
    except (requests.RequestException, ValueError):
        # Network failure or a body that is not valid JSON: fall through to
        # the same best-effort path as a non-200 response.
        pass

    print(word1, word2)
    return 0


# selecting keyword for response

def next_keyword(candidate_kws, target_kw, delay=0.8):
    """Pick the candidate with the highest ConceptNet relatedness to the target.

    Args:
        candidate_kws: iterable of candidate keywords.
        target_kw: the keyword the conversation should move towards.
        delay: seconds to pause between consecutive API requests (rate limiting).

    Returns:
        The first candidate with the highest relatedness, or "" when there are
        no candidates (or none scores above -1).
    """
    max_relatedness = -1
    next_kw = ""

    for index, word in enumerate(candidate_kws):
        # Pause only *between* requests: sleeping after the final candidate
        # would just delay the return value.
        if index:
            time.sleep(delay)

        curr_relatedness = get_relatedness(word, target_kw)
        if curr_relatedness is None:
            # A response without a score must not break the numeric comparison.
            curr_relatedness = 0

        if curr_relatedness > max_relatedness:
            max_relatedness = curr_relatedness
            next_kw = word

        # A score of 1 cannot be beaten, so stop querying.
        if max_relatedness >= 1:
            break

    return next_kw

In [7]:
# extracting keyword from user statement using part of speech tagging and relatedness value from conceptnet.

def get_keyword(utterance, target_kw):
    """Extract the keyword of an utterance that is most related to the target.

    The utterance is tagged with spaCy; only nouns, proper nouns, verbs and
    adjectives are considered. Among those, the lemmas whose part of speech has
    the highest weight (see get_pos_weights) become candidates, and next_keyword
    picks the candidate most related to `target_kw`.

    Args:
        utterance: the text to extract a keyword from.
        target_kw: the keyword the conversation is being steered towards.

    Returns:
        The selected lemma, or "" when the utterance has no noun, proper noun,
        verb or adjective.
    """
    content_pos = ('NOUN', 'VERB', 'ADJ', 'PROPN')

    doc = nlp(utterance)
    word_pos_tags = [(token.text, token.pos_, token.lemma_) for token in doc if token.pos_ in content_pos]
    print(word_pos_tags)

    if not word_pos_tags:
        return ""

    final_scores = [get_pos_weights(pos) for _, pos, _ in word_pos_tags]

    # Compute the best weight once instead of once per token.
    best_score = max(final_scores)
    top_lemmas = [
        lemma
        for (_, _, lemma), score in zip(word_pos_tags, final_scores)
        if score == best_score
    ]

    # Drop repeated lemmas (keeping first-seen order): every candidate costs one
    # rate-limited API request, and a duplicate can never change the winner.
    keywords = list(dict.fromkeys(top_lemmas))

    return next_keyword(keywords, target_kw)

In [8]:
# get_keyword("India is a great country", "Ganga")

[('India', 'PROPN', 'India'), ('great', 'ADJ', 'great'), ('country', 'NOUN', 'country')]


'India'

In [9]:
# finding all (single worded) directly connected nodes to the given word from conceptnet

def get_related_keywords_hop1(kw):
    """Collect the single-word English neighbours of `kw` in ConceptNet.

    The node's edges are fetched page by page. For each edge whose two
    endpoints are both English, the endpoint that is not `kw` is kept if it is
    a single word (no space or hyphen) and not in `stop_words`.

    Args:
        kw: the keyword whose directly connected nodes are wanted.

    Returns:
        A set containing `kw` itself plus the lower-cased neighbour words.
        If a request fails or returns a non-200 status, paging stops and the
        words gathered so far are returned.
    """
    from urllib.parse import quote

    page_size = 1000
    kw_lower = kw.lower()
    # ConceptNet term URIs are lower-case with underscores for spaces; quoting
    # keeps unusual characters from breaking the URL.
    node_path = "/c/en/" + quote(kw_lower.strip().replace(" ", "_"), safe="")

    related_kws = set()
    related_kws.add(kw)

    offset = 0
    while True:
        url = f"http://api.conceptnet.io{node_path}?limit={page_size}&offset={offset}"

        try:
            response = requests.get(url, timeout=30)
        except requests.RequestException as err:
            print(f"ConceptNet request failed for {kw!r}: {err}")
            break

        # Previously a non-200 answer (e.g. rate limiting) neither broke out of
        # the loop nor was retried differently, so the loop never terminated.
        if response.status_code != 200:
            print(f"ConceptNet returned status {response.status_code} for {kw!r}")
            break

        edges = response.json().get("edges", [])
        if not edges:
            break

        for edge in edges:
            start = edge.get("start", {})
            end = edge.get("end", {})

            if start.get("language", "") != "en" or end.get("language", "") != "en":
                continue

            # A missing label becomes "" so the string operations below are safe.
            start_word = start.get("label") or ""
            end_word = end.get("label") or ""

            # Compare case-insensitively: a label such as "India" for kw "india"
            # must be recognised as the keyword itself, not as its neighbour.
            if start_word.lower() == kw_lower:
                connected_word = end_word.lower()
            else:
                connected_word = start_word.lower()

            # Keep single words only (no spaces or hyphens) and skip empty labels.
            if not connected_word or re.search(r"[ -]", connected_word):
                continue

            # Stop words are checked on the lower-cased form.
            if connected_word not in stop_words:
                related_kws.add(connected_word)

        offset += page_size

    return related_kws


# finding all (single worded) nodes directly connected to the 1st hop nodes of the given word

def get_related_keywords_hop2(related_kws_hop1):
    """Collect the single-word English neighbours of every 1st-hop keyword.

    Each keyword's neighbourhood is obtained from get_related_keywords_hop1
    rather than from a second copy of the same paging and filtering loop, so
    the two functions cannot drift apart.

    Args:
        related_kws_hop1: iterable of keywords (typically a 1st-hop set).

    Returns:
        A set holding every input keyword together with all words returned by
        get_related_keywords_hop1 for that keyword.
    """
    related_kws_hop2 = set()

    for hop1_kw in related_kws_hop1:
        related_kws_hop2.add(hop1_kw)
        related_kws_hop2.update(get_related_keywords_hop1(hop1_kw))

    return related_kws_hop2

In [10]:
# get_related_keywords_hop1("india")

In [None]:
# BERT tokenizer and encoder used to embed words for cosine-similarity comparisons.
bert_checkpoint = 'bert-base-uncased'

emb_tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
emb_model = BertModel.from_pretrained(bert_checkpoint)

In [12]:
def get_bert_embedding(word):
    """Embed `word` with the BERT encoder and return the vector as a NumPy array.

    The vector is the last-layer hidden state at sequence position 0 (the
    leading special token added by the tokenizer), with size-1 axes squeezed out.
    """
    encoded = emb_tokenizer(word, return_tensors='pt')

    # Inference only: no gradients are needed.
    with torch.no_grad():
        last_hidden = emb_model(**encoded).last_hidden_state

    first_position = last_hidden[:, 0, :]
    return first_position.squeeze().numpy()

def cosine_similarity(word1, embedding2):
    """Return the cosine similarity between the embedding of `word1` and `embedding2`.

    `word1` is embedded here with get_bert_embedding; `embedding2` must already
    be a vector, so callers can embed a fixed reference word once and reuse it.
    """
    embedding1 = get_bert_embedding(word1)
    norm_product = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
    return np.dot(embedding1, embedding2) / norm_product


# The function below differs from next_keyword in that it uses cosine similarity as the metric to select the word closest to the target word.
# It is used in place of the ConceptNet relatedness value to reduce the time taken.

def next_keyword_cossim(candidate_kws, target_kw):
    """Pick the candidate whose BERT embedding is most similar to the target's.

    Returns the first candidate with the highest cosine similarity, or "" when
    there are no candidates (or none scores above -1).
    """
    # Embed the target once; every candidate is compared against this vector.
    target_embedding = get_bert_embedding(target_kw)

    best_word, best_score = "", -1
    for candidate in candidate_kws:
        score = cosine_similarity(candidate, target_embedding)
        if score > best_score:
            best_word, best_score = candidate, score

    return best_word

In [None]:
# Load our fine-tuned DialoGPT (tokenizer and model) from the Drive checkpoint directory.
response_checkpoint = '/content/drive/MyDrive/BTech_Project/ft_dialogpt_ctx_hist5_epoch1'

response_tokenizer = AutoTokenizer.from_pretrained(response_checkpoint)
response_model = AutoModelForCausalLM.from_pretrained(response_checkpoint)

In [16]:
# Suppress warnings from the transformers library by lowering its log
# verbosity to errors only.

import transformers
transformers.logging.set_verbosity_error()

In [18]:
# We filter the target topic's 1st hop (its directly connected nodes) so that only the 50 words closest to the current topic are kept.
# This is done because the 2nd-hop words reached through these intermediate nodes are the likeliest to intersect with the current topic's 1st hop, as some of them will be more related to the current topic.
# We also limit the 1st-hop search space to 50 words so that the search stays time-efficient.

def filter_rw2_hop1(rw2_hop1, user_kw, target_kw, top_n=50):
    """Keep the `top_n` target-side 1st-hop words most similar to the user keyword.

    Args:
        rw2_hop1: iterable of 1st-hop words of the target keyword.
        user_kw: the current (user-side) keyword the words are ranked against.
        target_kw: not used by the ranking; kept so existing callers keep working.
        top_n: how many of the best-ranked words to keep (default 50, the
            previously hard-coded limit).

    Returns:
        A set with at most `top_n` words, chosen by descending BERT cosine
        similarity to `user_kw`.
    """
    user_embedding = get_bert_embedding(user_kw)

    scored_words = [(cosine_similarity(word, user_embedding), word) for word in rw2_hop1]

    # Stable sort on the similarity only, best first.
    scored_words.sort(key=lambda pair: pair[0], reverse=True)

    # max(..., 0) guards against a negative top_n silently slicing from the end.
    return {word for _, word in scored_words[:max(top_n, 0)]}

In [None]:
# Interactive loop for chatting manually with our keyword-guided chatbot.
# Each turn: read a user utterance, choose the next keyword that moves the
# conversation towards the target word, and generate a reply conditioned on it.
# Entering an empty line ends the session.

target_kw = input("Target word: ")

# Conversation history (user and bot utterances in order).
conv = []

# Keyword chosen on the previous turn; reused when the new utterance yields no keyword.
prev_kw = ""

# 1st-hop neighbourhood of the target. The 2nd-hop set is built lazily inside the
# loop; the empty string is the "not computed yet" marker (it is falsy).
rw2_hop1 = get_related_keywords_hop1(target_kw)
rw2_hop2 = ""

while(1):
    user_stmt = input("User> ")

    if user_stmt == "":
        break

    conv.append(user_stmt)

    user_kw = get_keyword(user_stmt, target_kw)

    if not prev_kw and not user_kw:
        # No keyword in this utterance and none from an earlier turn:
        # go straight to the target word.
        next_kw = target_kw
        print("user keyword: ")

    else:
        if not user_kw:
            user_kw = prev_kw
        print("user keyword: " + user_kw)

        rw1_hop1 = get_related_keywords_hop1(user_kw)

        # First try words adjacent to both the user keyword and the target.
        intersection = rw1_hop1.intersection(rw2_hop1)

        if not intersection:
            # Otherwise try the target's 2nd hop, computing it on first use.
            if rw2_hop2:
                intersection = rw1_hop1.intersection(rw2_hop2)
            else:
                # NOTE(review): the 2nd hop is built from 1st-hop words filtered by the
                # user keyword of *this* turn and is then reused on later turns, even
                # though user_kw will have changed — confirm this is intended.
                filtered_rw2_hop1 = filter_rw2_hop1(rw2_hop1, user_kw, target_kw)
                rw2_hop2 = get_related_keywords_hop2(filtered_rw2_hop1)
                intersection = rw1_hop1.intersection(rw2_hop2)

        if not intersection:
            # No shared word at all: pick from the user keyword's own 1st hop by
            # embedding similarity to the target.
            next_kw = next_keyword_cossim(rw1_hop1, target_kw)

        else:
            # Shared words exist: pick the one with the highest ConceptNet relatedness.
            next_kw = next_keyword(intersection, target_kw)

    print("next keyword: ", next_kw)

    # Model input: the last five utterances joined by the EOS token, followed by
    # " KW <next keyword>" and a final EOS token.
    new_input_ids = response_tokenizer.encode(response_tokenizer.eos_token.join(conv[-5:]) + " KW "+ next_kw + response_tokenizer.eos_token, return_tensors='pt')

    # Sampling-based generation (top-k / top-p with temperature).
    chat_history_ids = response_model.generate(
        new_input_ids,
        max_length=1000,
        pad_token_id=response_tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=100,
        top_p=0.7,
        temperature = 0.8,
    )

    reply = response_tokenizer.decode(chat_history_ids[0]).split(response_tokenizer.eos_token)

    # Take the second-to-last EOS-separated segment; presumably this is the newly
    # generated reply, assuming the output ends with an EOS token — TODO confirm for
    # generations that stop at max_length instead.
    reply = reply[len(reply)-2]
    conv.append(reply)

    print(f"TBot> {reply}\n")

    prev_kw = next_kw

#### self play simulation

In [None]:
# BlenderBot model and tokenizer, loaded from the same pretrained checkpoint.
blenderbot_checkpoint = "facebook/blenderbot-400M-distill"

og_model = BlenderbotForConditionalGeneration.from_pretrained(blenderbot_checkpoint)
og_tokenizer = BlenderbotTokenizer.from_pretrained(blenderbot_checkpoint)

In [None]:
import pandas as pd

# Self-play input table: one start utterance and one target topic per row.
df = pd.read_csv("/content/drive/MyDrive/BTech_Project/self_play_input.csv")

# Pull both columns out as plain Python lists.
start_utterances, target_topics = (
    df[column].tolist() for column in ("start_utterance", "target_topic")
)

In [None]:
# Check whether the target word, or a word extremely close to it, is present in an utterance.

def found(sentence, word, threshold=0.9):
    """Tell whether `sentence` mentions `word` or a WordNet near-equivalent of it.

    Args:
        sentence: the utterance to inspect.
        word: the target word.
        threshold: minimum WordNet path similarity between a token's synset and
            a target synset for the token to count as a match (default 0.9, the
            previously hard-coded value).

    Returns:
        True if the lower-cased target is one of the sentence's tokens, or if
        any token has a synset whose path similarity to one of the target's
        synsets is at least `threshold`; False otherwise.
    """
    # The sentence is lower-cased before tokenising, so the target must be
    # lower-cased too; otherwise a capitalised target could never match directly.
    target = word.lower()
    words = word_tokenize(sentence.lower())

    if target in words:
        return True

    # Look the target up in WordNet only after the cheap direct check has failed.
    target_synsets = wn.synsets(target)
    if not target_synsets:
        return False

    # dict.fromkeys drops repeated tokens while keeping their order.
    for w in dict.fromkeys(words):
        for ws in wn.synsets(w):
            for ts in target_synsets:
                path_sim = ws.path_similarity(ts)
                if path_sim is not None and path_sim >= threshold:
                    print(w, word)
                    return True

    return False

In [None]:
# This code conducts all the chats and writes them to self_play.txt.
# We used BlenderBot instead of DialoGPT as it produces better responses.

# Self-play: for every row of the input table, our keyword-guided bot (TBot) talks
# to BlenderBot for up to 8 turns, starting from the row's start utterance. The
# transcript is appended to self_play.txt; a conversation ends early ("success")
# as soon as either bot's reply contains the target word according to found().
for i in range(0, len(target_topics)):

    # target_kw = target_topics[i]
    # NOTE(review): the target is pinned to "pasta" for every conversation, so the
    # targets loaded from the CSV only determine how many conversations are run.
    # Restore the line above to use the per-row targets.
    target_kw = "pasta"

    print("target keyword: ", target_kw)

    # 1st-hop neighbourhood of the target; the 2nd hop is built lazily
    # ("" is the falsy "not computed yet" marker).
    rw2_hop1 = get_related_keywords_hop1(target_kw)
    rw2_hop2 = ""

    conv = []

    prev_kw = ""
    user_stmt = start_utterances[i]
    conv.append(user_stmt)

    print(user_stmt, "\n")

    # The context manager closes the transcript file even if a turn raises.
    with open("/content/drive/MyDrive/BTech_Project/self_play.txt", "a") as f:
        f.write("------------------------------------------------------------------------------------------------\n")
        f.write("target> " + target_kw + "\n\n" + "user>> " + user_stmt + "\n\n")

        num_turns = 0
        while num_turns < 8:

            user_kw = get_keyword(user_stmt, target_kw)

            if not prev_kw and not user_kw:
                # No keyword available at all: go straight to the target. `method`
                # must be set here too, because it is written to the log below.
                next_kw = target_kw
                method = "target keyword"

            else:
                if not user_kw:
                    user_kw = prev_kw
                print("Current Keyword: ", user_kw)

                rw1_hop1 = get_related_keywords_hop1(user_kw)

                method = "1st hop - 1st hop"
                intersection = rw1_hop1.intersection(rw2_hop1)
                if not intersection:
                    method = "1st hop - 2nd hop"
                    if rw2_hop2:
                        intersection = rw1_hop1.intersection(rw2_hop2)
                    else:
                        filtered_rw2_hop1 = filter_rw2_hop1(rw2_hop1, user_kw, target_kw)
                        rw2_hop2 = get_related_keywords_hop2(filtered_rw2_hop1)
                        intersection = rw1_hop1.intersection(rw2_hop2)

                if not intersection:
                    method = "1st hop"
                    next_kw = next_keyword_cossim(rw1_hop1, target_kw)

                else:
                    next_kw = next_keyword(intersection, target_kw)

            print("Next Keyword: ", next_kw)

            # ---- TBot (fine-tuned DialoGPT) turn ----
            # Input: last five utterances joined by EOS, then " KW <keyword>" and EOS.
            new_input_ids = response_tokenizer.encode(response_tokenizer.eos_token.join(conv[-5:]) + " KW "+ next_kw + response_tokenizer.eos_token, return_tensors='pt')

            chat_history_ids = response_model.generate(
                new_input_ids,
                max_length=1000,
                pad_token_id=response_tokenizer.eos_token_id,
                no_repeat_ngram_size=3,
                do_sample=True,
                top_k=100,
                top_p=0.7,
                temperature=0.8,
            )

            reply = response_tokenizer.decode(chat_history_ids[0]).split(response_tokenizer.eos_token)

            # Second-to-last EOS-separated segment. The input always contains an EOS
            # token, so the split yields at least two segments.
            reply = reply[-2]

            conv.append(reply)

            print(f"TBot> {reply}\n")

            num_turns = num_turns + 1

            f.write("current kw> " + user_kw + "\n" + "next kw> " + next_kw + "; method: " + method + "\n\n" + "Tbot>> " + reply + "\n")

            if found(reply, target_kw):
                f.write(f"\nsuccess; {num_turns}")
                f.write("\n\n")
                break

            prev_kw = next_kw

            # ---- BlenderBot turn (plays the user) ----
            # (An earlier variant used a DialoGPT-style EOS-joined prompt here.)
            og_ids = og_tokenizer(["</s> <s>".join(conv[-5:])], return_tensors='pt')

            try:
                chat_history_ids = og_model.generate(
                    **og_ids,
                    no_repeat_ngram_size=3,
                )
            except Exception:
                # Best-effort retry with a shorter context and sampling.
                # `except Exception` (not a bare `except`) lets KeyboardInterrupt
                # stop the run. Presumably the failure is an over-long input —
                # TODO confirm which exception is actually expected.
                og_ids = og_tokenizer(["</s> <s>".join(conv[-4:])], return_tensors='pt')
                chat_history_ids = og_model.generate(
                    **og_ids,
                    no_repeat_ngram_size=3,
                    do_sample=True,
                    top_k=100,
                    top_p=0.7,
                    temperature=0.8,
                )

            og_reply = og_tokenizer.batch_decode(chat_history_ids, skip_special_tokens=True)
            blender_reply = og_reply[0]

            conv.append(blender_reply)

            print(f"og_bot> {blender_reply}\n")

            # Log the reply before the success check so the utterance that reached
            # the target also appears in the transcript.
            f.write("Blender>> " + blender_reply + "\n\n")

            if found(blender_reply, target_kw):
                f.write(f"\nsuccess; {num_turns}")
                # Trailing blank lines keep the next conversation's header on its own line.
                f.write("\n\n")
                break

            user_stmt = blender_reply

    print("\n\n")

    # break

    # break

target keyword:  pasta
i live in usa 

[('live', 'VERB', 'live'), ('usa', 'PROPN', 'usa')]
Current Keyword:  usa
Next Keyword:  money
TBot> i do not make much money, i am a stay at home mom.

og_bot>  I'm sorry to hear that. Do you have any hobbies to keep you occupied?

[('sorry', 'ADJ', 'sorry'), ('hear', 'VERB', 'hear'), ('have', 'VERB', 'have'), ('hobbies', 'NOUN', 'hobby'), ('keep', 'VERB', 'keep'), ('occupied', 'VERB', 'occupy')]
Current Keyword:  hobby
Next Keyword:  past
TBot> i love to go to the beach and read. i also like to travel

og_bot>  That sounds like a lot of fun. I wish I had the time to travel more. 

[('sounds', 'VERB', 'sound'), ('lot', 'NOUN', 'lot'), ('fun', 'NOUN', 'fun'), ('wish', 'VERB', 'wish'), ('had', 'VERB', 'have'), ('time', 'NOUN', 'time'), ('travel', 'VERB', 'travel'), ('more', 'ADJ', 'more')]
Current Keyword:  fun
Next Keyword:  salsa
TBot> yes, i do. i am really into food and love to cook, so i do not have a lot.

og_bot>  I like to cook as well. Wha

##### rough work

In [None]:
# Scratch check: does the (filtered) 2nd hop of "uv" share any word with the
# 1st hop of "season"?
# Note that `a` is first the 1st hop of "uv" and is then overwritten with its 2nd hop.
a = get_related_keywords_hop1("uv")
filtered_rw2_hop1 = filter_rw2_hop1(a, "season", "uv")
a = get_related_keywords_hop2(filtered_rw2_hop1)
print(a)
b = get_related_keywords_hop1("season")
print(b)
# Last expression of the cell: the shared words are shown as the cell output.
a.intersection(b)



{'actinotherapy', 'chemosterilizer', 'energy', 'haxx0r', 'sunbright', 'haxx', 'juicy', 'photoinhibition', 'leeter', 'sunshiny', 'antiultraviolet', 'sux0r', 'radiophotoluminescence', 'sunshineless', 'uk', 'leetspeek', 'pwned', 'masterful', 'light', 'mycosporine', 'haxxor', '4rum', 'night', 'h4x', 'grape', 'warez', 'invisible', 'sterilizer', 'pr0n', 'miss', 'actinophone', 'lolz', 'sunlessness', 'slang', 'warmth', 'nir', '0r', 'little', 'uv', 'colour', 'daylight', 'n00b', 'leets', 'xuv', 'pollock', 'cheerfulness', 'product', 'fruit', 'fuv', 'summer', 'hax0r', 'suntan', 'sunfilled', '1', 'shinier', 'radiation', 'uvc', 'warm', 'legal', 'rain', 'sky', 'darkness', 'automotive', 'sunbed', 'rays', 'photocuring', 'ray', 'octabenzone', 'nanometre', 'fir', 'sleepy', 'proly', 'l33t5p34k', 'expert', 'scotland', 'european', 'botany', 'shade', 'steriliser', 'j00', 'pwn', 'fux0r', 'l33tspeak', 'daytime', 'direct', 'seeing', 'juarez', 'sunbeam', 'leat', 'conjunctivitis', 'l337', 'leetspeak', 'leetman', 

{'rain', 'summer', 'weather'}

In [None]:
# Scratch check of set sizes before and after filtering.
# NOTE(review): relies on rw2_hop1, user_kw and target_kw left in the kernel by an
# earlier cell; this cell does not define them.
print(len(rw2_hop1))

filtered_rw2_hop1 = filter_rw2_hop1(rw2_hop1, user_kw, target_kw)
print(len(filtered_rw2_hop1))

rw2_hop2 = get_related_keywords_hop2(filtered_rw2_hop1)

419
[(0.98111075, 'dinner'), (0.9807363, 'gala'), (0.98068225, 'parents'), (0.9802759, 'celebration'), (0.9802394, 'funeral'), (0.97997487, 'festivities'), (0.97995675, 'bash'), (0.9798032, 'celebrating'), (0.97966796, 'reception'), (0.9791776, 'wine'), (0.978763, 'trick'), (0.97840494, 'fin'), (0.97809094, 'drinks'), (0.97800314, 'slang'), (0.97799754, 'ritual'), (0.9778194, 'wedding'), (0.9777198, 'playful'), (0.9776146, 'drinking'), (0.97683495, 'alice'), (0.97645015, 'enjoy'), (0.97629124, 'loud'), (0.9757295, 'camp'), (0.97565913, 'whip'), (0.9756091, 'merry'), (0.9755657, 'bolt'), (0.975233, 'gathering'), (0.97506493, 'gatherings'), (0.974483, 'fair'), (0.9742871, 'gifts'), (0.9738767, 'funny'), (0.97370344, 'time'), (0.9736212, 'ultimatum'), (0.97360355, 'bringhouse'), (0.97343177, 'candles'), (0.9730171, 'family'), (0.9727833, 'interventor'), (0.9726068, 'banquet'), (0.9725424, 'even'), (0.9724818, 'flip'), (0.97222066, 'food'), (0.97221696, 'malice'), (0.972006, 'friends'), (0

In [None]:
# Time analysis: compare next_keyword_cossim (BERT cosine similarity) with
# next_keyword (ConceptNet relatedness API) on the same candidate set.
# NOTE(review): relies on rw2_hop2, rw1_hop1 and target_kw left in the kernel by
# earlier cells; this cell does not define them.

import time

print(len(rw2_hop2)) # word - today; 13 mins

print(len(rw1_hop1))

# Wall-clock time of the embedding-based selection.
st = time.time()
next_kw = next_keyword_cossim(rw1_hop1, target_kw)
et = time.time()

print(next_kw, et-st)

# Wall-clock time of the API-based selection (includes its sleep between requests).
st = time.time()
next_kw = next_keyword(rw1_hop1, target_kw)
et = time.time()

print(next_kw, et-st)

42922
36
news 3.915778160095215
present 71.44423031806946
