In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [22]:
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split as ttsa
from sklearn.linear_model import LogisticRegression

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, Flatten, Embedding

import markovify

import random

<IPython.core.display.Javascript object>

In [3]:
import nltk

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/wyattgarner/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/wyattgarner/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/wyattgarner/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

<IPython.core.display.Javascript object>

##### Goals
* Predict movie genre from conversation
* Generate dialogue
* Chatbot

### Read and Process Data

In [4]:
with open("18754_24465_bundle_archive/movie_lines.txt", encoding="latin-1") as f:
    lines = f.readlines()

movie_lines = []

for elem in lines:
    line_list = elem.strip("\n").split(" +++$+++ ")
    movie_lines.append(line_list)

<IPython.core.display.Javascript object>

In [5]:
with open(
    "18754_24465_bundle_archive/movie_conversations.txt", encoding="latin-1"
) as f:
    lines = f.readlines()

movie_conversations = []

for elem in lines:
    line_list = elem.strip("\n").split(" +++$+++ ")
    line_list[-1] = line_list[-1].strip("][").split(", ")
    movie_conversations.append(line_list)

    genres = []
    for i in line_list[-1]:
        i = i.strip("'")
        genres.append(i)
    line_list[-1] = genres

<IPython.core.display.Javascript object>

In [6]:
with open(
    "18754_24465_bundle_archive/movie_characters_metadata.txt", encoding="latin-1"
) as f:
    lines = f.readlines()

characters = []

for elem in lines:
    line_list = elem.strip("\n").split(" +++$+++ ")
    characters.append(line_list)

characters = characters[1:]

<IPython.core.display.Javascript object>

In [7]:
with open(
    "18754_24465_bundle_archive/movie_titles_metadata.txt", encoding="latin-1"
) as f:
    lines = f.readlines()

movie_titles = []

for elem in lines:
    line_list = elem.strip("\n").split(" +++$+++ ")
    line_list[-1] = line_list[-1].strip("][").split(", ")

    genres = []
    for i in line_list[-1]:
        i = i.strip("'")
        genres.append(i)
    line_list[-1] = genres

    movie_titles.append(line_list)

<IPython.core.display.Javascript object>

In [8]:
%%time

movie_d = dict()
for movie_title in movie_titles:
    line_list= []
    small_d = dict()
    for conversation in movie_conversations:
        if movie_title[0] == conversation[2]:
            for line in movie_lines:
                if line[0] in conversation[-1]:
                    line_list.append(line[-1])
    
    small_d['title'] = movie_title[1]
    small_d['genres'] = movie_title[-1]
    small_d['lines'] = line_list
    movie_d[movie_title[0]] = small_d
    


CPU times: user 1h 46min 5s, sys: 3min 37s, total: 1h 49min 42s
Wall time: 1h 55min 48s


<IPython.core.display.Javascript object>

### Genre Classification 
* Training set is dialogue from movies grouped by genre
* Test set is dialogue from individual movies

In [9]:
train_movies = dict(list(movie_d.items())[:501])
test_movies = dict(list(movie_d.items())[501:])

<IPython.core.display.Javascript object>

In [10]:
%%time

genre_d = dict()
for k, v in train_movies.items():
    for genre in v["genres"]:
        if genre not in genre_d.keys():
            genre_d[genre] = v["lines"]
        else:
            genre_d[genre].extend(v["lines"])

CPU times: user 17.7 ms, sys: 9.71 ms, total: 27.4 ms
Wall time: 31.2 ms


<IPython.core.display.Javascript object>

In [11]:
all_dialogue = []
for elem in genre_d.values():
    all_dialogue.append(" ".join(elem))

<IPython.core.display.Javascript object>

In [12]:
def preprocess(docs):
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer("english")
    preprocessed = []

    for doc in docs:
        tokenized = word_tokenize(doc)

        cleaned = [
            stemmer.stem(lemmatizer.lemmatize(token.lower()))
            for token in tokenized
            if not token.lower() in stopwords.words("english")
            if token.isalpha()
        ]

        untokenized = " ".join(cleaned)
        preprocessed.append(untokenized)

    return preprocessed

<IPython.core.display.Javascript object>

In [13]:
%%time

preprocessed = preprocess(all_dialogue)

CPU times: user 1h 15min 6s, sys: 15min 53s, total: 1h 31min
Wall time: 1h 31min 43s


<IPython.core.display.Javascript object>

In [14]:
y_train = list(genre_d.keys())

<IPython.core.display.Javascript object>

#### Random Forest

In [18]:
model = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", RandomForestClassifier(n_estimators=100)),
    ]
)

model.fit(preprocessed, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier())])

<IPython.core.display.Javascript object>

In [19]:
model.score(preprocessed, y_train)

0.68

<IPython.core.display.Javascript object>

In [20]:
yes = 0
no = 0
for key in test_movies.keys():
    test = []
    test.append(" ".join(test_movies[key]["lines"]))
    test = preprocess(test)
    predictions = model.predict(test)
    if predictions in test_movies[key]["genres"]:
        yes += 1
    else:
        no += 1

print(yes / (yes + no))

0.0


<IPython.core.display.Javascript object>

#### Logistic Regression

In [23]:
model = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", LogisticRegression()),
    ]
)

model.fit(preprocessed, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression())])

<IPython.core.display.Javascript object>

In [24]:
model.score(preprocessed, y_train)

0.68

<IPython.core.display.Javascript object>

In [25]:
yes = 0
no = 0
for key in test_movies.keys():
    test = []
    test.append(" ".join(test_movies[key]["lines"]))
    test = preprocess(test)
    predictions = model.predict(test)
    if predictions in test_movies[key]["genres"]:
        yes += 1
    else:
        no += 1

print(yes / (yes + no))

0.04310344827586207


<IPython.core.display.Javascript object>

#### Gradient Boosting Classifier

In [26]:
model = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", GradientBoostingClassifier()),
    ]
)

model.fit(preprocessed, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', GradientBoostingClassifier())])

<IPython.core.display.Javascript object>

In [27]:
model.score(preprocessed, y_train)

0.68

<IPython.core.display.Javascript object>

In [28]:
yes = 0
no = 0
for key in test_movies.keys():
    test = []
    test.append(" ".join(test_movies[key]["lines"]))
    test = preprocess(test)
    predictions = model.predict(test)
    if predictions in test_movies[key]["genres"]:
        yes += 1
    else:
        no += 1

print(yes / (yes + no))

0.034482758620689655


<IPython.core.display.Javascript object>

The test accuracy of all three models is abysmal but logistic regression performed the best.

### Text Generation
* Generate text from each genre of movie

In [29]:
tokenized_list = []
for dialogue in all_dialogue:
    tokenized = sent_tokenize(dialogue)
    tokenized_list.append(tokenized)

<IPython.core.display.Javascript object>

In [106]:
count = 0
for dialogue in tokenized_list:
    print(list(genre_d.keys())[count], "\n")
    count += 1
    model = markovify.Text(dialogue, state_size=5)
    for i in range(10):
        print(model.make_short_sentence(max_chars=200, min_chars=30, tries=100))
    print("`" * 100, "\n")

comedy 

Yes, look, I wanted to talk to you about that when we're alone...
You stole this truck, and now you're trying to do the same thing!
Better than spending my life crawling through windows in the middle of the night to meet that bitch in a bar...
You know, Dwight, I hear you're the only one who came up positive for drugs?
-- I want you to give me your word you won't come in.
I'd be very interested to know what kind of shoes you had on?
It is difficult to express but I'd rather not go to the trouble of bringing up those birds.
Up at Pee Dee, I couldn't remember any of the reasons I loved him.
I know, Sister, but I have a very good relationship with all the salesmen.
What the hell am I supposed to do for twenty-four hours?
```````````````````````````````````````````````````````````````````````````````````````````````````` 

romance 

These people are paying you a lot, do you think you could give me something to cement the deal...
What's the good of -- Look, Walter, I came up here t

Forgive me now for seeming personal, but we understand that there is a great respons- ibility in representing the Federation.
I told you before that I would like to have died in peace...
You are to have nothing to do with the reason he came here?
No -- I love you so much and I want you to know I thought about what you said...
I don't like it either, but I'm just trying to find an explanation for the continual reappearances.
You're missing most of your life, Jim, and you don't even know how long ago it was.
This doesn't come out of nowhere, there has to be a darn good reason for wanting them kept out of it...
All I know is that I might be able to find something on the ship.
I want to meet with representatives from all the nations of the Earth -- but I was not allowed the Opportunity.
What if I had nothing to do with the investigation into Bill Tuggle's death.
```````````````````````````````````````````````````````````````````````````````````````````````````` 

sci-fi 

We're going down 

I believe that something happened here and I want to check it out ..
A meteorite just hit the ground near here and I want to cooperate in every way I can.
I believe that something happened here and I want to check it out ..
I believe that something happened here and I want to cooperate in every way I can.
I believe that something happened here and I want to check it out ..
I believe that something happened here and I want to cooperate in every way I can.
```````````````````````````````````````````````````````````````````````````````````````````````````` 

sport 

None
If he ain't gonna listen to you, he ain't gonna listen to you, he ain't gonna listen to nobody!
None
He ain't gonna get a shot at the title because there's nobody else.
None
He ain't gonna get a shot at the title because there's nobody else.
None
He ain't gonna get a shot at the title because there's nobody else.
He ain't gonna get a shot at the title because there's nobody else.
He ain't gonna get a shot at the title bec

<IPython.core.display.Javascript object>

Generated text from most of the genres is comprehensible.