# Machine Learning Nanodegree Capstone
# Chat Room Recommendation

In [3]:
import os
import collections
import random

### Load Data

In [4]:
# path will need to be changed pending on where the repo is cloned to
os.chdir(os.path.expanduser('~/PycharmProjects/chat-room-recommendation/'))
lines = open('cornell-movie-dialogs-corpus/movie_lines.txt','r').read().split('\n')
conv_lines = open('cornell-movie-dialogs-corpus/movie_conversations.txt','r').read().split('\n')
character_metadata = open('cornell-movie-dialogs-corpus/movie_characters_metadata.txt','r').read().split('\n')
movie_metadata = open('cornell-movie-dialogs-corpus/movie_titles_metadata.txt','r').read().split('\n')

In [5]:
lines[:10]

['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!',
 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!',
 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.',
 'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?',
 "L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.",
 'L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow',
 "L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.",
 'L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No',
 'L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?',
 'L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?']

In [6]:
conv_lines[:10]

["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L271', 'L272', 'L273', 'L274', 'L275']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L276', 'L277']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L280', 'L281']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L363', 'L364']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L365', 'L366']"]

Use gensim simple pre-processing tool to create a dictionary with keys = movie_id and value = tokenized text of all the lines in the movie.

In [7]:
from gensim import utils
movieLines = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        if _line[2] in movieLines: 
            movieLines[_line[2]] = movieLines.get(_line[2]) + utils.simple_preprocess(_line[4])
        else:
            movieLines[_line[2]] = utils.simple_preprocess(_line[4])

In [8]:
# sanity check that there are the appropriate number of movies in the movieLines dict
print "Correct # of movies" if len(movieLines) == 617 else "something went wrong with movieLines dict"
#print movieLines.get("m616")

Correct # of movies


Create a dictionary to map each line's id with it's text

In [9]:
id2line = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        id2line[_line[0]] = _line[4]

In [10]:
# sanity check for id2line dict
print "Correct # of lines" if len(id2line) == 304713 else "something went wrong with id2line dict"

Correct # of lines


Use gensim simple pre-processing tool to create a dictionary with keys = character_id and value = tokenized text of all conversations for that character.

In [11]:
characterConversations = {}
for line in conv_lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 4:
        _line[3] = _line[3].strip("[]")
        for conv in _line[3].split(","):
            conv = conv.replace("'","").replace(" ", "")
            if _line[0] in characterConversations:
                characterConversations[_line[0]] = characterConversations.get(_line[0]) + \
                                                   utils.simple_preprocess(id2line.get(str(conv)))
            else:
                characterConversations[_line[0]] = utils.simple_preprocess(id2line.get(str(conv)))

            if _line[1] in characterConversations:
                characterConversations[_line[1]] = characterConversations.get(_line[1]) + \
                                                   utils.simple_preprocess(id2line.get(str(conv)))
            else:
                characterConversations[_line[1]] = utils.simple_preprocess(id2line.get(str(conv)))

In [12]:
# sanity check that there are the appropriate # of characters in the characterConversation dict
print "Correct # of characters" if len(characterConversations) == 9035 else "something went wrong with character dict"
#print characterConversations.get("u0")

Correct # of characters


In [13]:
# function that creates corpus (list of TaggedDocmuments) from dictionaries
from gensim.models.doc2vec import TaggedDocument


def create_corpus(dictname):
    corpus_list =[]
    for key, value in dictname.iteritems():
        corpus_list.append(TaggedDocument(value, [int(key[1:])]))
    return corpus_list

In [14]:
train_corpus = create_corpus(movieLines)
test_corpus = create_corpus(characterConversations)

In [15]:
# sanity check of length of corpus
print "Correct # of movies in train_corpus" if len(train_corpus) == 617 else "something went wrong with train_corpus"
print "Correct # of characters in test_corpus" if len(test_corpus) == 9035 else "something went wrong with test_corpus"
# print train_corpus[0].tags

Correct # of movies in train_corpus
Correct # of characters in test_corpus


Instantiate a Doc2Vec Object

In [16]:
from gensim.models import Doc2Vec

model = Doc2Vec(size=50, iter=20, min_count=2)

In [17]:
model.build_vocab(train_corpus)

In [18]:
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)

CPU times: user 8min 31s, sys: 3.56 s, total: 8min 35s
Wall time: 3min 40s


44391323

In [19]:
# save model to be loaded later if needed
model.save('/tmp/movie_model.doc2vec')

In [19]:
# load model if saved during a previous session
from gensim.models import Doc2Vec

model = Doc2Vec.load('/tmp/movie_model.doc2vec')

## Assessing Model
To assess the doc2vec model, inferring new vectors for each document of the training corpus, compare the inferred vectors with the training corpus, and then returning the rank of the document based on self-similarity. This approach is pretending as if the training corpus is some new unseen data and then seeing how they compare with the trained model. The expectation is that model will be overfit and so finding similar documents should be very easily. The second ranks will also be tracked for comparison of less similar docutents. (https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb)  

In [22]:
ranks = []
first_ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):

    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    
    rank = [docid for docid, sim in sims].index(train_corpus[doc_id].tags[0])
        
    ranks.append(rank) 
    
    first_ranks.append(sims[0])
    second_ranks.append(sims[1])

In [23]:
collections.Counter(ranks)

Counter({0: 615, 1: 2})

* Doc2Vec(size=30, iter=20) resulted in Counter({0: 614, 1: 3}) <br/>
trained in : CPU times: user 2min 24s, sys: 732 ms, total: 2min 24s Wall time: 54.1 s <br/>
* Doc2Vec(size=30, iter=50) resulted in Counter({0: 615, 1: 2})<br/>
trained in : CPU times: user 6min, sys: 1.75 s, total: 6min 2s Wall time: 2min 17s
* Doc2Vec(size=30, iter=20, min_count=2) resulted in Counter({0: 615, 1: 2}) <br/>
trained in : CPU times: user 2min 18s, sys: 712 ms, total: 2min 19s Wall time: 56.6 s
* Doc2Vec(size=30, iter=50, min_count=2) resulted in Counter({0: 614, 1: 3}) <br/>
trained in : CPU times: user 5min 21s, sys: 1.96 s, total: 5min 23sWall time: 2min
* Doc2Vec(size=50, iter=20, min_count=2) resulted in Counter({0: 616, 1: 1})
trained in : CPU times: user 2min 21s, sys: 752 ms, total: 2min 22s Wall time: 52 s

## Sensitivity analysis
Test Doc2Vec model with random lines removed from the training_corpus. The model will be assessed the same way the original training_corpus was assessed.  

In [24]:
# altlines = open('cornell-movie-dialogs-corpus/movie_lines_mod.txt','r').read().split('\n')
from random import randint
from gensim import utils

altMovieLines = {}
altTotalLines = 0
for line in lines:
    if randint(0, 9) != 7:
        altTotalLines = altTotalLines+1
        _line = line.split(' +++$+++ ')
        if len(_line) == 5:
            if _line[2] in altMovieLines: 
                altMovieLines[_line[2]] = altMovieLines.get(_line[2]) + utils.simple_preprocess(_line[4])
            else:
                altMovieLines[_line[2]] = utils.simple_preprocess(_line[4])

In [25]:
print "Correct # of movies" if len(altMovieLines) == 617 else "something went wrong with movieLines dict"
print "# of lines were decreased" if altTotalLines < 304713 else "something went wrong with id2line dict"
print altTotalLines

Correct # of movies
# of lines were decreased
274275


In [26]:
alt_train_corpus = create_corpus(altMovieLines)

In [27]:
alt_model = Doc2Vec(size=50, iter=20, min_count=2)

In [28]:
alt_model.build_vocab(alt_train_corpus)

In [29]:
%time alt_model.train(alt_train_corpus, total_examples=alt_model.corpus_count, epochs=alt_model.iter)

CPU times: user 7min 11s, sys: 2.87 s, total: 7min 14s
Wall time: 2min 38s


39987989

In [30]:
alt_model.save('/tmp/alt_model.doc2vec')

In [31]:
from gensim.models import Doc2Vec

alt_model = Doc2Vec.load('/tmp/alt_model.doc2vec')

In [32]:
alt_ranks = []

for alt_doc_id in range(len(alt_train_corpus)):

    alt_inferred_vector = alt_model.infer_vector(alt_train_corpus[alt_doc_id].words)
    alt_sims = alt_model.docvecs.most_similar([alt_inferred_vector], topn=len(model.docvecs))
    
    alt_rank = [alt_docid for alt_docid, alt_sim in alt_sims].index(alt_train_corpus[alt_doc_id].tags[0])
        
    alt_ranks.append(alt_rank)

In [33]:
collections.Counter(alt_ranks)

Counter({0: 614, 1: 3})

Test Doc2Vec model with test_corpus (individual character converations) which consist of 9035 different documents.  The model will be assessed the same way the training_corpus was assessed. 

In [34]:
model2 = Doc2Vec(size=50, iter=20, min_count=2)

In [35]:
model2.build_vocab(test_corpus)

In [36]:
%time model2.train(test_corpus, total_examples=model2.corpus_count, epochs=model2.iter)

CPU times: user 17min 24s, sys: 10.6 s, total: 17min 35s
Wall time: 7min 22s


89876212

In [37]:
model2.save('/tmp/character_model.doc2vec')

In [38]:
from gensim.models import Doc2Vec

model2 = Doc2Vec.load('/tmp/character_model.doc2vec')

In [39]:
ranks2 = []

for doc_id2 in range(len(test_corpus)):

    inferred_vector2 = model2.infer_vector(test_corpus[doc_id2].words)
    sims2 = model2.docvecs.most_similar([inferred_vector2], topn=len(model2.docvecs))
    
    rank2 = [docid2 for docid2, sim2 in sims2].index(test_corpus[doc_id2].tags[0])
        
    ranks2.append(rank2)

In [40]:
collections.Counter(ranks2)

Counter({0: 8238,
         1: 453,
         2: 80,
         3: 45,
         4: 37,
         5: 19,
         6: 24,
         7: 10,
         8: 16,
         9: 13,
         10: 11,
         11: 9,
         12: 7,
         13: 2,
         14: 1,
         15: 5,
         16: 2,
         17: 2,
         18: 1,
         19: 3,
         20: 3,
         21: 3,
         22: 3,
         23: 6,
         24: 1,
         25: 1,
         26: 1,
         27: 2,
         28: 2,
         34: 1,
         35: 1,
         36: 1,
         39: 2,
         41: 1,
         42: 1,
         44: 2,
         46: 1,
         47: 1,
         48: 3,
         50: 1,
         53: 1,
         54: 1,
         59: 1,
         62: 1,
         63: 1,
         76: 1,
         78: 1,
         80: 1,
         86: 1,
         87: 1,
         91: 1,
         92: 2,
         119: 1,
         142: 1,
         143: 1,
         228: 1,
         264: 1,
         441: 1,
         5017: 1})

## Utility functions

In [41]:
# get character metadata by id
def get_character_metadata(id):
    for character in character_metadata:
        _character = character.split(' +++$+++ ')
        if len(_character) == 6:
            if _character[0] == 'u'+str(id):
                return _character
        
           
def get_movie_title(id):
    for movie in movie_metadata:
        _movie = movie.split(' +++$+++ ')
        if len(_movie) == 6:
            if _movie[0] == 'm'+str(id):
                return _movie[1]
            
            
# create def to look up tag doc in corpus by tag id
def get_corpus_index(corpus, tag):
    for tag_index in range(len(corpus)):
        if corpus[tag_index].tags[0] == tag:
            return tag_index

Simple test of the first movie in the training corpus to ensure that the movie was returned as the most similar document by the model.  The similarity score is also displayed. A random document from the training coupus is aslo selected along and compared with the similarity score of the second most similar document.  

In [42]:
print('Movie title : {}\nMovie id : {}\n'.format(get_movie_title(train_corpus[doc_id].tags[0]), train_corpus[doc_id].tags[0]))
print(u'MODEL %s:\n' % model)

print get_movie_title(train_corpus[get_corpus_index(train_corpus, sims[0][0])].tags[0])
print sims[0], '\n\n----------------------\n'

# Pick a random document from the train corpus and infer a vector from the model
rand_train_id = random.randint(0, len(train_corpus))

# Inspect the score of the second ranked movie. 
# The score for the second ranked movie should be much lower then for the top document.
print('Train Document ({}): «{}»\n'.format(rand_train_id, get_movie_title(rand_train_id)))
top_id = first_ranks[get_corpus_index(train_corpus, rand_train_id)]
print('Top Document {}: «{}»\n'.format(top_id, get_movie_title(top_id[0])))
sim_id = second_ranks[get_corpus_index(train_corpus, rand_train_id)]
print('Similar Document {}: «{}»\n'.format(sim_id, get_movie_title(sim_id[0])))

Movie title : star wars
Movie id : 529

MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

star wars
(529, 0.9903789758682251) 

----------------------

Train Document (35): «blast from the past»

Top Document (35, 0.987032413482666): «blast from the past»

Similar Document (462, 0.7288445234298706): «notting hill»



In [43]:
# Movie 193 "star trek: the wrath of khan" returns "star trek iii: the search for spock" as the second most 
# similar document. * I guess if you have seen one Star Trek movie you have seen them all... 
rand_train_id = 193

# Inspect the score of the second ranked movie
print('Train Document ({}): «{}»\n'.format(rand_train_id, get_movie_title(rand_train_id)))
top_id = first_ranks[get_corpus_index(train_corpus, rand_train_id)]
print('Top Document {}: «{}»\n'.format(top_id, get_movie_title(top_id[0])))
sim_id = second_ranks[get_corpus_index(train_corpus, rand_train_id)]
print('Similar Document {}: «{}»\n'.format(sim_id, get_movie_title(sim_id[0])))

Train Document (193): «star trek: the wrath of khan»

Top Document (193, 0.991351842880249): «star trek: the wrath of khan»

Similar Document (192, 0.9387556314468384): «star trek iii: the search for spock»



## Testing the model
Using the same approach above, we'll infer the vector for a randomly chosen test document (character conversation), and compare the document to the model by eye.


In [44]:
# find similarity of movie character is from and index 
def get_source_similarity(sim_list, movie_id):
    for sim_index in range(len(sim_list)):
        if int(movie_id[1:]) == sim_list[sim_index][0]:
            return get_movie_title(sim_list[sim_index][0]), sim_index, sim_list[sim_index][1]
        
        
# print top 5 recommended movies
def get_recommended_movies(sim_list):
    print('\nTop 5\n {}\n {}\n {}\n {}\n {}\n'.format(get_movie_title(sim_list[0][0]), get_movie_title(sim_list[1][0]), 
                                                get_movie_title(sim_list[2][0]), get_movie_title(sim_list[3][0]),
                                                get_movie_title(sim_list[4][0])))
    
    
def display_character_similarity(char_id, show_words):
    inferred_vector = model.infer_vector(test_corpus[char_id].words)
    test_sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    character_metadata = get_character_metadata(char_id)
    print('Character : {}\n ID : {}\n Movie : {}'.format(character_metadata[1], character_metadata[2], 
                                                         character_metadata[3]))
    if show_words :
        print('Test Document ({}): «{}»\n'.format(char_id, ' '.join(test_corpus[get_corpus_index(test_corpus, char_id)].words)))
    print test_sims[:10]
    get_recommended_movies(test_sims)
    print get_source_similarity(test_sims, character_metadata[2]), '\n'
    
    
def get_doc2vec_similarity(char_id):
    inferred_vector = model.infer_vector(test_corpus[char_id].words)
    test_sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

    return test_sims[:10]

In [45]:
# Two characters from the same movie 
display_character_similarity(1220, True)
display_character_similarity(1223, True)

Character : EDDIE
 ID : m80
 Movie : halloween h20: 20 years later
Test Document (1220): «besides it historically inaccurate what the fuck are you talking about michael meyers never used meat cleaver it was butcher knife who are you the serial killer police what difference does it make it not historically accurate that all another historical inaccuracy would somebody shut this guy up»

[(379, 0.7948283553123474), (602, 0.7291299700737), (460, 0.6810026168823242), (359, 0.6802983283996582), (7, 0.6716787815093994), (148, 0.6479409337043762), (8, 0.6213617920875549), (510, 0.6147223711013794), (601, 0.6047148704528809), (227, 0.6002674698829651)]

Top 5
 halloween
 what women want
 a nightmare on elm street part 2: freddy's revenge
 friday the 13th part iii
 a nightmare on elm street 4: the dream master

('halloween h20: 20 years later', 35, 0.5313981175422668) 

Character : KERI
 ID : m80
 Movie : halloween h20: 20 years later
Test Document (1223): «ll be damned do know you mind if we s

In [46]:
# compare results from three characters that have conversations in a movie and a random character
# Star wars : Hans, Luke and Vader
# u7821 +++$+++ HAN +++$+++ m529
display_character_similarity(7821, False)

# u7824 +++$+++ LUKE +++$+++ m529
display_character_similarity(7824, False)

# u7827 +++$+++ VADER
display_character_similarity(7827, True)

# random character from test corpus
display_character_similarity(random.randint(0, len(test_corpus)), False)

Character : HAN
 ID : m529
 Movie : star wars
[(92, 0.7802433967590332), (602, 0.6923903822898865), (85, 0.6741838455200195), (60, 0.658081591129303), (501, 0.6443480849266052), (460, 0.6412927508354187), (110, 0.6304922103881836), (64, 0.6292725205421448), (270, 0.6260251998901367), (448, 0.6246729493141174)]

Top 5
 house of 1000 corpses
 what women want
 hellbound: hellraiser ii
 fear and loathing in las vegas
 the salton sea

('star wars', 480, 0.20733563601970673) 

Character : LUKE
 ID : m529
 Movie : star wars


[(92, 0.938231348991394), (460, 0.6798879504203796), (184, 0.6715690493583679), (64, 0.6685425639152527), (217, 0.6623952984809875), (501, 0.6532403230667114), (148, 0.6326410174369812), (440, 0.6106000542640686), (538, 0.6011205911636353), (371, 0.5948300957679749)]

Top 5
 house of 1000 corpses
 a nightmare on elm street part 2: freddy's revenge
 slither
 friday the 13th
 there's something about mary

('star wars', 571, -0.07022307068109512) 

Character : VADER
 ID : m529
 Movie : star wars
Test Document (7827): «enough already know about the data you ve intercepted but its too late whatever information you ve gathered will be destroyed you will come to know such suffering as only the master of the bogan force can provide you ll get no information from me you have no authority the council can hold me it appears your ship had an accident will see to it that your death is duely reported there will be no one to save you this time the death star has become operational there is no force 


Top 5
 the adventures of ford fairlane
 the salton sea
 the black dahlia
 the rock
 what women want

('star wars', 591, 0.04749935865402222) 

Character : BURKE
 ID : m15
 Movie : aliens
[(118, 0.9850348234176636), (493, 0.7783792018890381), (159, 0.7692041397094727), (608, 0.7578772902488708), (340, 0.7540571689605713), (466, 0.7441763877868652), (480, 0.7358413934707642), (113, 0.7269444465637207), (164, 0.7138222455978394), (328, 0.6932591199874878)]



Top 5
 legend
 romeo and juliet
 pirates of the caribbean
 willow
 excalibur

('aliens', 178, 0.2509397864341736) 



In [47]:
# two characters that only have conversations with each other so they have identical words in their document
display_character_similarity(5519, True)
display_character_similarity(5508, True)

Character : PETER
 ID : m367
 Movie : get carter
Test Document (5519): «gerald phoned us in the middle of the night said he heard you ve been making nuisance of yourself we ve got to take you back to london he said it be doing him big favour we know why you re all steamed up and so do gerald and sid but they have to be diplomatic put it away jack you know you won use it the gun he means gerald wants to see him first shut up»

[(602, 0.7370354533195496), (270, 0.7155796885490417), (484, 0.6413251161575317), (459, 0.6298543214797974), (456, 0.6193528771400452), (486, 0.6151827573776245), (521, 0.6123079657554626), (85, 0.5797296166419983), (359, 0.5704122185707092), (338, 0.5603929162025452)]

Top 5
 what women want
 the black dahlia
 vampyr
 the nightmare before christmas
 the negotiator

('get carter', 166, 0.4120137393474579) 

Character : CON
 ID : m367
 Movie : get carter
Test Document (5508): «gerald phoned us in the middle of the night said he heard you ve been making nuisance of 


Top 5
 what women want
 the black dahlia
 quantum project
 seven days to live
 i am legend

('get carter', 171, 0.42591947317123413) 



In [48]:
# two characters that only have conversations with each other so they have identical words in their document
display_character_similarity(259, True)
display_character_similarity(265, True)

Character : PAPAGENO
 ID : m16
 Movie : amadeus
Test Document (259): «here am my angel what who the devil are you ve taken pity on you my angel heard your wish oh well thank you how wonderful some people get all the luck now you ve got to promise me faithfully you ll remain true to me forever then you ll see how tenderly your little birdie will love you can wait well promise then what do you mean now of course now right away before get any older well don know mean you re delicious delightful delectable little bird but don you think you might be just little tough oh tender enough for you my boy tender enough for you»

[(270, 0.6591387987136841), (602, 0.647212564945221), (369, 0.6006808876991272), (364, 0.5874727964401245), (484, 0.5859410166740417), (203, 0.5789071917533875), (609, 0.5529367923736572), (456, 0.5523012280464172), (389, 0.546088695526123), (95, 0.5448217391967773)]

Top 5
 the black dahlia
 what women want
 the godfather: part ii
 gandhi
 vampyr

('amadeus', 160, 0.39930

The Doc2Vec algorithm starts by giving distinct document-IDs an initial random vector; also most training modes include some randomized steps. So even identical runs-of-words won't necessarily result in identically-trained vectors. Rather, they'll tend to become closer over training – perhaps arbitrarily close with enough passes, but never identical. - https://groups.google.com/forum/#!topic/gensim/LLmPa4LECXs