In [None]:
!pip install spacy

In [None]:
!python -m spacy download en_core_web_lg

In [None]:
import spacy
import en_core_web_lg
# Load the large English model once. The original cell loaded it a second time
# via en_core_web_lg.load(), which only repeated the work and rebound `nlp`.
nlp = spacy.load('en_core_web_lg')

In [None]:
nlp.vocab['amazing'].vector # replace 'amazing' with whatever word you want!

In [None]:
# Download the Cornell Movie-Dialogs Corpus archive into the working directory
# (-L follows redirects, -O saves the file under its remote name).
!curl -L -O http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip

In [None]:
!unzip cornell_movie_dialogs_corpus.zip

In [None]:
# Map each line ID (field 0) to its utterance text (field 4).
movie_lines = {}
# `with` guarantees the file handle is closed once the loop finishes.
with open("./cornell movie-dialogs corpus/movie_lines.txt",
          encoding="latin1") as lines_file:
    for line in lines_file:
        line = line.strip()
        parts = line.split(" +++$+++ ")
        # Records that do not split into exactly five fields are stored with
        # an empty utterance so that every line ID still has an entry.
        movie_lines[parts[0]] = parts[4] if len(parts) == 5 else ""

In [2]:
import json

In [None]:
import json

# Map each line ID to the ID of the line that follows it in a conversation.
responses = {}
# `with` guarantees the file handle is closed once the loop finishes.
with open("./cornell movie-dialogs corpus/movie_conversations.txt",
          encoding="latin1") as conversations_file:
    for line in conversations_file:
        line = line.strip()
        parts = line.split(" +++$+++ ")
        # Field 3 holds the conversation's line IDs as a single-quoted list;
        # swapping the quotes lets it be parsed as JSON.
        line_ids = json.loads(parts[3].replace("'", '"'))
        # Pair every line with its immediate successor.
        for first, second in zip(line_ids[:-1], line_ids[1:]):
            responses[first] = second

Just to make sure everything works, the cell below prints out five random pairs of conversational turns from the corpus:

In [None]:
import random

# random.sample() requires a sequence; a dict items view is rejected on
# Python 3.11+, so materialise it into a list first.
for pair in random.sample(list(responses.items()), 5):
    print("A:", movie_lines[pair[0]])
    print("B:", movie_lines[pair[1]])
    print()

### Making a sentence vector

To make the sentence vector for each line of dialog, I am using spaCy. The function `sentence_mean` below takes the spaCy object that we loaded earlier (`nlp`) and uses it to tokenize the string that you pass into the function (i.e., break it up into words). It then uses numpy's `mean()` function to find the average of the vectors, producing a new vector. The shape of the resulting vector (i.e., the number of dimensions) should be the same as the shape of the individual word vectors.

(Note: I disabled the `tagger` and `parser` parts of spaCy's pipeline to improve performance. We're not using part of speech tags or dependency relations in this chatbot, so there's no reason to spend time calculating them.)

In [None]:
import numpy as np
def sentence_mean(nlp, s):
    """Return the element-wise mean of the token vectors of ``s``.

    Args:
        nlp: callable invoked as ``nlp(text, disable=['tagger', 'parser'])``
            whose result iterates over tokens exposing a ``.vector``
            attribute (in this notebook, the loaded spaCy pipeline).
        s: text to summarise. An empty string is replaced by a single space
            before being passed to ``nlp`` (presumably so the resulting
            document is never empty).

    Returns:
        The mean of the per-token vectors, taken along axis 0.
    """
    text = s if s != "" else " "
    tokens = nlp(text, disable=['tagger', 'parser'])
    token_vectors = np.array([token.vector for token in tokens])
    return np.mean(token_vectors, axis=0)
# Sanity check: display the shape of the averaged vector for a sample sentence.
sentence_mean(nlp, "This... is a test.").shape

In [None]:
!pip install simpleneighbors

In [None]:
from simpleneighbors import SimpleNeighbors

# Build a nearest-neighbour index over a random sample of 10000 lines that
# have a recorded response, keyed by line ID.
nns = SimpleNeighbors(300)
sampled_line_ids = random.sample(list(responses.keys()), 10000)
for i, line_id in enumerate(sampled_line_ids):
    line_text = movie_lines[line_id]
    # show progress
    if i % 1000 == 0:
        print(i, line_id, line_text)
    summary_vector = sentence_mean(nlp, line_text)
    # Leave out lines whose averaged vector is all zeros.
    if not np.any(summary_vector):
        continue
    nns.add_one(line_id, summary_vector)
nns.build()

In [None]:
sentence = "I like making bots."

# Look up the indexed lines closest to the sentence's vector and keep the
# first one returned.
query_vector = sentence_mean(nlp, sentence)
nearest_ids = nns.nearest(query_vector, 5)
picked = nearest_ids[0]

# The reply is whatever line followed the picked turn in the corpus.
response_line_id = responses[picked]

print("Your line:\n\t", sentence)
print("Most similar turn:\n\t", movie_lines[picked])
print("Response to most similar turn:\n\t", movie_lines[response_line_id])

In [None]:
!pip install https://github.com/aparrish/semanticsimilaritychatbot/archive/master.zip

Then create a chatbot object, passing in the spaCy language object (`nlp`) and the number of dimensions:

In [None]:
from semanticsimilaritychatbot import SemanticSimilarityChatbot
# 300 is the number of vector dimensions; presumably it must match the vectors
# produced by the loaded spaCy model (TODO confirm).
chatbot = SemanticSimilarityChatbot(nlp, 300)

The `.add_pair()` method in the object takes two strings: a turn and the response to that turn. We'll get these from the `responses` and `movie_lines` lookups, again sampling ten thousand pairs at random. This cell will take a little while to run:

In [None]:
sample_n = 10000

# Draw `sample_n` random (turn, response) ID pairs and register their texts.
sampled_pairs = random.sample(list(responses.items()), sample_n)
for first_id, second_id in sampled_pairs:
    turn_text = movie_lines[first_id]
    reply_text = movie_lines[second_id]
    chatbot.add_pair(turn_text, reply_text)

chatbot.build()

In [None]:
# Ask the freshly built chatbot for a reply to a sample turn.
print(chatbot.response_for("Hello computer!"))

In [None]:
my_turn = "The weather's nice today, don't you think?"

# Ask for a reply to the same turn while passing 5, 10, ..., 50 as the second
# argument of response_for().
for i in range(5, 51, 5):
    print("picking from", i, "possible responses:")
    reply = chatbot.response_for(my_turn, i)
    print(reply, end="\n\n")

The Semantic Similarity Chatbot object has a `.save()` method that saves the pre-built database to disk, using a filename prefix you supply. (It saves three different files: `<prefix>.annoy`, `<prefix>-data.pkl`, and `<prefix>-chatbot.pkl`).

In [None]:
# Persist the built chatbot to disk under this filename prefix.
chatbot.save("movielines-10k-sample")

In [None]:
# Reload the chatbot from the files saved under the same prefix.
# NOTE(review): the saved files include .pkl files; if these are unpickled on
# load, only load prefixes you created yourself — confirm against the library.
chatbot = SemanticSimilarityChatbot.load("movielines-10k-sample", nlp)

In [None]:
# Check that the reloaded chatbot still produces replies.
print(chatbot.response_for("I'm going to go get some coffee."))

In [None]:
from google.colab import files
# Trigger a browser download for each of the three saved chatbot files.
for saved_file in ('movielines-10k-sample.annoy',
                   'movielines-10k-sample-data.pkl',
                   'movielines-10k-sample-chatbot.pkl'):
    files.download(saved_file)

## Making it interactive

If you're using this notebook in Google Colab, the following cell will create a little interactive interface for chatting with the bot that you just built. Run the two cells below and start typing into the box.

In [None]:
# HTML/CSS/JS for a minimal chat widget: a scrolling log <div>, a text input,
# and a change handler that sends the typed text to `getResp` and appends both
# turns to the log. `getResp` itself is not defined in this string; whoever
# renders it must append a <script> binding `getResp` to either `colabGetResp`
# (Colab kernel callback) or `webGetResp` (HTTP GET to /response.json).
chatbot_html = """
<style type="text/css">#log p { margin: 5px; font-family: sans-serif; }</style>
<div id="log"
     style="box-sizing: border-box;
            width: 600px;
            height: 32em;
            border: 1px grey solid;
            padding: 2px;
            overflow: scroll;">
</div>
<input type="text" id="typehere" placeholder="type here!"
       style="box-sizing: border-box;
              width: 600px;
              margin-top: 5px;">
<script>
function paraWithText(t) {
    let tn = document.createTextNode(t);
    let ptag = document.createElement('p');
    ptag.appendChild(tn);
    return ptag;
}
document.querySelector('#typehere').onchange = async function() {
    let inputField = document.querySelector('#typehere');
    let val = inputField.value;
    inputField.value = "";
    let resp = await getResp(val);
    let objDiv = document.getElementById("log");
    objDiv.appendChild(paraWithText('😀: ' + val));
    objDiv.appendChild(paraWithText('🤖: ' + resp));
    objDiv.scrollTop = objDiv.scrollHeight;
};
async function colabGetResp(val) {
    let resp = await google.colab.kernel.invokeFunction(
        'notebook.get_response', [val], {});
    return resp.data['application/json']['result'];
}
async function webGetResp(val) {
    let resp = await fetch("/response.json?sentence=" + 
        encodeURIComponent(val));
    let data = await resp.json();
    return data['result'];
}
</script>
"""

In [None]:
 import IPython
from google.colab import output

display(IPython.display.HTML(chatbot_html + \
                             "<script>let getResp = colabGetResp;</script>"))

def get_response(val):
    resp = chatbot.response_for(val)
    return IPython.display.JSON({'result': resp})

output.register_callback('notebook.get_response', get_response)

If you're not using Colab, try the following two cells to install [Flask](http://flask.pocoo.org) and run a little web server from your notebook that lets you chat with the bot. Click on the link that appears below the second cell to open up the chat in a new window.

In [None]:
!pip install flask

In [None]:
from flask import Flask, request, jsonify
app = Flask(__name__)


@app.route("/response.json")
def response():
    """Return the chatbot's reply to the `sentence` query parameter as JSON."""
    sentence = request.args['sentence']
    reply = chatbot.response_for(sentence)
    return jsonify({'result': reply})


@app.route("/")
def home():
    """Serve the chat page with `getResp` bound to the HTTP-based variant."""
    page = chatbot_html + "<script>let getResp = webGetResp;</script>"
    return page


# Start Flask's built-in server with its default settings.
app.run()