## In this assignment, you'll work with a dataset called the Cornell Movie--Dialogs Corpus, which was released by Cornell University. The dataset contains conversations from more than 600 movies.

In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import os
import random
import re
import string

import nltk
import numpy as np
import pandas as pd
import spacy
from sqlalchemy import create_engine

<IPython.core.display.Javascript object>

In [3]:
!pip install --ignore-installed PyYAML
!pip install chatterbot
!pip install chatterbot-corpus

Processing /Users/tarrantcarter/Library/Caches/pip/wheels/13/90/db/290ab3a34f2ef0b5a0f89235dc2d40fea83e77de84ed2dc05c/PyYAML-5.3.1-cp38-cp38-macosx_10_9_x86_64.whl
Installing collected packages: PyYAML
Successfully installed PyYAML-5.3.1
Collecting chatterbot
  Using cached ChatterBot-1.0.5-py2.py3-none-any.whl (67 kB)
Collecting pint>=0.8.1
  Using cached Pint-0.16.1-py2.py3-none-any.whl (205 kB)
Collecting pyyaml<5.2,>=5.1
  Downloading PyYAML-5.1.2.tar.gz (265 kB)
[K     |████████████████████████████████| 265 kB 329 kB/s eta 0:00:01
[?25hCollecting mathparse<0.2,>=0.1
  Using cached mathparse-0.1.2-py3-none-any.whl (7.2 kB)
Collecting pymongo<4.0,>=3.3
  Using cached pymongo-3.11.2-cp38-cp38-macosx_10_9_x86_64.whl (380 kB)
Collecting spacy<2.2,>=2.1
  Using cached spacy-2.1.9.tar.gz (30.7 MB)
  Installing build dependencies ... [?25lerror
[31m  ERROR: Command errored out with exit status 1:
   command: /Users/tarrantcarter/miniconda3/bin/python /Users/tarrantcarter/miniconda3/li

<IPython.core.display.Javascript object>

In [4]:
# Connection settings for the Postgres database that hosts the corpus.
# Each value can be overridden with an environment variable so credentials do
# not have to be edited in the notebook; the defaults keep the notebook
# runnable as-is.
postgres_user = os.environ.get("DIALOGS_PG_USER", "dsbc_student")
postgres_pw = os.environ.get("DIALOGS_PG_PASSWORD", "7*.8G9QH21")
postgres_host = os.environ.get("DIALOGS_PG_HOST", "142.93.121.174")
postgres_port = os.environ.get("DIALOGS_PG_PORT", "5432")
postgres_db = os.environ.get("DIALOGS_PG_DB", "cornell_movie_dialogs")

engine = create_engine(
    "postgresql://{}:{}@{}:{}/{}".format(
        postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db
    )
)

try:
    dialogs = pd.read_sql_query("select * from dialogs", con=engine)
finally:
    # no need for an open connection, as we're only doing a single query;
    # dispose in `finally` so the engine is released even if the query fails
    engine.dispose()


dialogs.head(10)

Unnamed: 0,index,dialogs
0,0,Can we make this quick? Roxanne Korrine and A...
1,1,"Well, I thought we'd start with pronunciation,..."
2,2,Not the hacking and gagging and spitting part....
3,3,Okay... then how 'bout we try out some French ...
4,4,You're asking me out. That's so cute. What's ...
5,5,Forget it.
6,6,"No, no, it's my fault -- we didn't have a prop..."
7,7,Cameron.
8,8,"The thing is, Cameron -- I'm at the mercy of a..."
9,9,Seems like she could get a date easy enough...


<IPython.core.display.Javascript object>

## First, preprocess the data to clean it up. You can reuse your solution from the Text preprocessing checkpoint assignment.

In [5]:
# Keep only the dialog text; the "index" column is not needed downstream.
dialogs2 = dialogs.drop(labels=["index"], axis="columns")
dialogs2.head()

Unnamed: 0,dialogs
0,Can we make this quick? Roxanne Korrine and A...
1,"Well, I thought we'd start with pronunciation,..."
2,Not the hacking and gagging and spitting part....
3,Okay... then how 'bout we try out some French ...
4,You're asking me out. That's so cute. What's ...


<IPython.core.display.Javascript object>

In [6]:
# Utility function for standard text cleaning
def text_cleaner(text):
    """Clean one dialog string and return the cleaned copy.

    The steps run in this order:

    1. every double dash ``--`` is replaced with a space,
    2. anything enclosed in square brackets (brackets included) is removed,
    3. standalone numbers are replaced with a space,
    4. runs of whitespace are collapsed to single spaces and the ends trimmed.

    Parameters
    ----------
    text : str
        Raw dialog text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Visual inspection identifies a form of punctuation that spaCy does not
    # recognize: the double dash --.  Better get rid of it now!
    text = re.sub(r"--", " ", text)
    # Raw string: "\[" in a normal string literal is an invalid escape sequence.
    # The match is non-greedy, so each bracketed span is removed on its own.
    text = re.sub(r"[\[].*?[\]]", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    # split()/join collapses every whitespace run and strips both ends.
    return " ".join(text.split())

<IPython.core.display.Javascript object>

In [7]:
# Run every dialog through text_cleaner, then wrap the cleaned strings in a
# new single-column DataFrame.
dialogs3 = [text_cleaner(raw_dialog) for raw_dialog in dialogs2["dialogs"]]

dialogs4 = pd.DataFrame({"dialogs": dialogs3})
dialogs4.head()

Unnamed: 0,dialogs
0,Can we make this quick? Roxanne Korrine and An...
1,"Well, I thought we'd start with pronunciation,..."
2,Not the hacking and gagging and spitting part....
3,Okay... then how 'bout we try out some French ...
4,You're asking me out. That's so cute. What's y...


<IPython.core.display.Javascript object>

In [8]:
# Load the English pipeline with the parser and NER components disabled, and
# add a sentencizer component to it.
nlp = spacy.load("en", disable=["parser", "ner"])
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)
# All cleaned dialogs are joined into one string and parsed as a single Doc;
# nlp.max_length is set to 20,000,000 before doing so.
nlp.max_length = 20_000_000
doc = nlp(" ".join(dialogs4["dialogs"]))

<IPython.core.display.Javascript object>

In [9]:
# Summarise the parsed corpus: container type, token count, first three
# tokens, and the type of a single token.
doc_facts = (
    ("The doc object is a {} object.", type(doc)),
    ("It is {} tokens long", len(doc)),
    ("The first three tokens are '{}'", doc[:3]),
    ("The type of each token is {}", type(doc[0])),
)
for template, fact in doc_facts:
    print(template.format(fact))

The doc object is a <class 'spacy.tokens.doc.Doc'> object.
It is 4162681 tokens long
The first three tokens are 'Can we make'
The type of each token is <class 'spacy.tokens.token.Token'>


<IPython.core.display.Javascript object>

In [10]:
# Collect the text of every sentence in the Doc, keeping only sentences whose
# text is longer than one character (the filter is on characters, not words).
sents = [text for text in (sent.text for sent in doc.sents) if len(text) > 1]

<IPython.core.display.Javascript object>

## Develop a chatbot using this corpus. In doing this, you're free to choose a chatbot development library like ChatterBot or write your own code from scratch.

In [11]:
GREETING_INPUTS = ["hello", "hi", "greetings", "what's up", "hey"]
GREETING_RESPONSES = ["hello", "hi", "hey", "hi there"]


def greeting(sentence):
    """Return a random greeting if `sentence` contains a known greeting.

    Matching is case-insensitive and ignores punctuation attached to the start
    or end of a word, so "Hi!" and "hello," are recognised. Multi-word entries
    of GREETING_INPUTS such as "what's up" are matched as whole phrases.

    Parameters
    ----------
    sentence : str
        The user's message.

    Returns
    -------
    str or None
        A random element of GREETING_RESPONSES when a greeting is found,
        otherwise None.
    """
    # Normalise to lowercase words with surrounding punctuation stripped.
    tokens = [word.strip(string.punctuation) for word in sentence.lower().split()]
    # Pad with spaces so phrases only match on whole-word boundaries
    # ("this" must not match "hi").
    padded = " " + " ".join(tokens) + " "

    for greeting_input in GREETING_INPUTS:
        if " " + greeting_input + " " in padded:
            return random.choice(GREETING_RESPONSES)
    return None

<IPython.core.display.Javascript object>

In [12]:
from chatterbot import ChatBot
from chatterbot.trainers import ListTrainer, ChatterBotCorpusTrainer
from chatterbot.conversation import Statement

<IPython.core.display.Javascript object>

In [13]:
# Create a ChatBot instance named "Dialogs".
chatbot = ChatBot("Dialogs")
# Drop the bot's storage to remove the knowledge base accumulated by earlier
# runs, so training starts from an empty store.
chatbot.storage.drop()

# Create a ListTrainer bound to the chatbot.
trainer = ListTrainer(chatbot)

# Train the chatbot on the sentences extracted from the cleaned corpus.
# NOTE(review): `sents` is passed as one flat list; presumably ListTrainer
# treats it as a single conversation, so sentences from unrelated dialogs end
# up adjacent -- confirm against the ChatterBot documentation.
trainer.train(sents)

List Trainer: [####################] 100%


<IPython.core.display.Javascript object>

## Start a conversation with your chatbot, and discuss its strengths and weaknesses.

In [19]:
import random

<IPython.core.display.Javascript object>

In [20]:
bot_name = "Movie Quoting Machine"
print(
    f"{bot_name}: I will try to respond to you reasonably. If you want to exit, type bye."
)

# Chat loop: keeps answering until the user types "bye" or thanks the bot.
while True:
    # Trim surrounding whitespace so inputs like "bye " still end the chat.
    user_input = input("User: ").strip().lower()

    if user_input == "bye":
        print(f"{bot_name}: Bye! It was a great chat.")
        break

    if user_input in ("thanks", "thank you"):
        # Reply first, then leave the loop; breaking before the print would
        # make the reply unreachable.
        print(f"{bot_name}: You're welcome.")
        break

    # Call greeting() once: it picks its reply at random, so a second call
    # could return a different value than the one that was tested.
    greeting_reply = greeting(user_input)
    if greeting_reply is not None:
        print(f"{bot_name}: " + greeting_reply)
    else:
        # Not a greeting: let the trained chatbot pick the response.
        print(f"{bot_name}: ", end="")
        print(chatbot.get_response(user_input))

Movie Quoting Machine: I will try to respond to you reasonably. If you want to exit, type bye.
User: hello
Movie Quoting Machine: hi
User: how are you?
Movie Quoting Machine: Takin' a bath.
User: sounds fun
Movie Quoting Machine: Yeah.
User: you busy?
Movie Quoting Machine: I figure we'll get over to the hotel... get checked in, cleaned up... then I'd like to do something I've been thinking about for a long time.
User: What is that thing?
Movie Quoting Machine: I think that thing is Ben.
User: Who is Ben?
Movie Quoting Machine: You never wanted to go out with 'me, did you?
User: No I don't
Movie Quoting Machine: The sub-surface structure shows that it was deliberately buried about four million years ago.
User: that was akward
Movie Quoting Machine: Forget French.
User: your confusing me
Movie Quoting Machine: Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.
User: That's good to know. I guess...
Movie Quoting Machine: Well, it's just f

<IPython.core.display.Javascript object>