# Machine Learning Nanodegree Capstone
# Chat Room Recommendation Benchmark

In [3]:
import os

### Load Data

In [4]:
# path will need to be changed pending on where the repo is cloned to
os.chdir(os.path.expanduser('~/PycharmProjects/chat-room-recommendation/'))
lines = open('cornell-movie-dialogs-corpus/movie_lines.txt','r').read().split('\n')
conv_lines = open('cornell-movie-dialogs-corpus/movie_conversations.txt','r').read().split('\n')
character_metadata = open('cornell-movie-dialogs-corpus/movie_characters_metadata.txt','r').read().split('\n')
movie_metadata = open('cornell-movie-dialogs-corpus/movie_titles_metadata.txt','r').read().split('\n')

In [5]:
from gensim import utils
movieLines = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        if _line[2] in movieLines: 
            movieLines[_line[2]] = movieLines.get(_line[2]) + utils.simple_preprocess(_line[4])
        else:
            movieLines[_line[2]] = utils.simple_preprocess(_line[4])

Using TensorFlow backend.


In [5]:
id2line = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        id2line[_line[0]] = _line[4]

In [7]:
characterConversations = {}
for line in conv_lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 4:
        _line[3] = _line[3].strip("[]")
        for conv in _line[3].split(","):
            conv = conv.replace("'","").replace(" ", "")
            if _line[0] in characterConversations:
                characterConversations[_line[0]] = characterConversations.get(_line[0]) + \
                                                   utils.simple_preprocess(id2line.get(str(conv)))
            else:
                characterConversations[_line[0]] = utils.simple_preprocess(id2line.get(str(conv)))

            if _line[1] in characterConversations:
                characterConversations[_line[1]] = characterConversations.get(_line[1]) + \
                                                   utils.simple_preprocess(id2line.get(str(conv)))
            else:
                characterConversations[_line[1]] = utils.simple_preprocess(id2line.get(str(conv)))

In [8]:
# function that creates corpus (list of TaggedDocmuments) from dictionaries
from gensim.models.doc2vec import TaggedDocument


def create_corpus(dictname):
    corpus_list =[]
    for key, value in dictname.iteritems():
        corpus_list.append(TaggedDocument(value, [int(key[1:])]))
    return corpus_list

In [9]:
train_corpus = create_corpus(movieLines)
test_corpus = create_corpus(characterConversations)

Instantiate a Doc2Vec Object
Load model saved from chatroom_recommendation_capstone.ipynb

In [10]:
# load model if saved during a previous session
from gensim.models import Doc2Vec

model = Doc2Vec.load('/tmp/movie_model.doc2vec')

## Utility functions

In [11]:
# get character metadata by id
def get_character_metadata(id):
    for character in character_metadata:
        _character = character.split(' +++$+++ ')
        if len(_character) == 6:
            if _character[0] == 'u'+str(id):
                return _character
        
           
def get_movie_title(id):
    for movie in movie_metadata:
        _movie = movie.split(' +++$+++ ')
        if len(_movie) == 6:
            if _movie[0] == 'm'+str(id):
                return _movie[1]
            
            
# create def to look up tag doc in corpus by tag id
def get_corpus_index(corpus, tag):
    for tag_index in range(len(corpus)):
        if corpus[tag_index].tags[0] == tag:
            return tag_index
        
        
def get_movie_genres(id):
    for movie in movie_metadata:
        _movie = movie.split(' +++$+++ ')
        if len(_movie) == 6:
            if _movie[0] == 'm'+str(id):
                return _movie[5]

In [12]:
# find similarity of movie character is from and index 
def get_source_similarity(sim_list, movie_id):
    for sim_index in range(len(sim_list)):
        if int(movie_id[1:]) == sim_list[sim_index][0]:
            return get_movie_title(sim_list[sim_index][0]), sim_index, sim_list[sim_index][1]
        
        
# print top 5 recommended movies
def get_recommended_movies(sim_list):
    print('\nTop 5\n {}\n {}\n {}\n {}\n {}\n'.format(get_movie_title(sim_list[0][0]), get_movie_title(sim_list[1][0]), 
                                                get_movie_title(sim_list[2][0]), get_movie_title(sim_list[3][0]),
                                                get_movie_title(sim_list[4][0])))
    
    
def display_character_similarity(char_id, show_words):
    inferred_vector = model.infer_vector(test_corpus[char_id].words)
    test_sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    character_metadata = get_character_metadata(char_id)
    print('Character : {}\n ID : {}\n Movie : {}'.format(character_metadata[1], character_metadata[2], 
                                                         character_metadata[3]))
    if show_words :
        print('Test Document ({}): «{}»\n'.format(char_id, ' '.join(test_corpus[get_corpus_index(test_corpus, char_id)].words)))
    print test_sims[:10]
    get_recommended_movies(test_sims)
    print get_source_similarity(test_sims, character_metadata[2]), '\n'
    
    
def get_doc2vec_similarity(char_id):
    inferred_vector = model.infer_vector(test_corpus[char_id].words)
    test_sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

    return test_sims[:10]

In [13]:
# quick test of loaded model
inferred_vector = model.infer_vector(train_corpus[get_corpus_index(train_corpus, 529)].words)
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

print('Movie title : {}\nMovie id : {}\n'.format(get_movie_title(train_corpus[get_corpus_index(train_corpus, 529)].tags[0]), train_corpus[get_corpus_index(train_corpus, 529)].tags[0]))
print(u'MODEL %s:\n' % model)

print get_movie_title(train_corpus[get_corpus_index(train_corpus, sims[0][0])].tags[0])
print sims[0]

Movie title : star wars
Movie id : 529

MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

star wars
(529, 0.9877308011054993)


## Benchmark
A content-based recommendation system will be used a benchmark model. TF-IDF (Term Frequency - Inverse Document Frequency) will be used to parse all of the conversations for each character and also all of the conversations for each movie. Cosine similarity will be used to determine which conversations are closest to each other

In [14]:
tfidf_movie_lines = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        _write_line = _line[4].replace(",", "").decode('iso-8859-1').encode('utf8')
        if _line[2] in tfidf_movie_lines: 
            tfidf_movie_lines[_line[2]] = tfidf_movie_lines.get(_line[2]) + _write_line
        else:
            tfidf_movie_lines[_line[2]] = _write_line

In [15]:
# sanity check that there are the appropriate number of movies in the tfidf_movie_lines dict
print "Correct # of movies" if len(tfidf_movie_lines) == 617 else "something went wrong with movieLines dict"

Correct # of movies


In [16]:
def create_tfidf_corpus(dictname):
    corpus_str = """id,description"""
    for key, value in dictname.iteritems():
        corpus_str += '\n' + key[1:] + ',' + value 
    return corpus_str

In [17]:
tfidf_train_corpus = create_tfidf_corpus(tfidf_movie_lines)

In [18]:
def tfidf_get_character_lines(char_id):  
    character_conv = {}
    char_id = 'u' + str(char_id)
    for line in conv_lines:
        _line = line.split(' +++$+++ ')
        if len(_line) == 4:
            _line[3] = _line[3].strip("[]")
            for conv in _line[3].split(","):
                conv = conv.replace("'","").replace(" ", "")
                _char_write_line = id2line.get(str(conv)).replace(",", "").decode('iso-8859-1').encode('utf8')
                if _line[0] == char_id:
                    if _line[0] in character_conv:
                        character_conv[_line[0]] = character_conv.get(_line[0]) + _char_write_line
                    else:
                        character_conv[_line[0]] = _char_write_line
                if _line[1] == char_id:
                    if _line[1] in character_conv:
                        character_conv[_line[1]] = character_conv.get(_line[1]) + _char_write_line
                    else:
                        character_conv[_line[1]] = _char_write_line
    char_str = """"""                    
    for key, value in character_conv.iteritems():
        char_str += '\n' + key[1:] + ',' + value
    return char_str

In [19]:
print tfidf_get_character_lines(1220)


1220,Besides it's historically inaccurate.What the fuck are you talking about?Michael Meyers never used a meat cleaver.  It was a butcher knife.Who are you the serial killer police?  What difference does it make?It's not historically accurate that's all.Another historical inaccuracy.Would somebody shut this guy up?


In [20]:
tmp_train_corpus = tfidf_train_corpus + tfidf_get_character_lines(1220)

In [21]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from StringIO import StringIO


sim_list = []


def _train(ds):

    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
    tfidf_matrix = tf.fit_transform(ds['description'])
    
    cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

    for idx, row in ds.iterrows():
        similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
        similar_items = [(ds['id'][i], cosine_similarities[idx][i]) for i in similar_indices]

        # First item is the item itself, so remove it.
        sim_list.append([row['id'], similar_items[1:]])    


def predict(item_id, num):
    for sim in sim_list:
        if item_id == sim[0]:
            if num < len(sim[1]):
                return sim[1][:num]
            else:
                return sim[1]

In [22]:
# Train tf-idf for character 1220
test_data = StringIO(tmp_train_corpus)
ds = pd.read_csv(test_data)
_train(ds)

In [23]:
test_id = 1220
tfidf_result = predict(test_id, 10)
doc2vec_result = get_doc2vec_similarity(test_id)

test_character_metadata = get_character_metadata(test_id)
print("Character {} from {} {}".format(test_character_metadata[1], test_character_metadata[3], 
                                       get_movie_genres(int(test_character_metadata[2][1:]))))

display_results = []
display_results.append(["Character ID :" + str(test_id), "tf-idf", "genre", "doc2vec", "genre"])

for idx, item in enumerate(doc2vec_result):   
    display_results.append(['Recommendation ' + str(idx+1), get_movie_title(tfidf_result[idx][0]), get_movie_genres(tfidf_result[idx][0]), get_movie_title(item[0]), get_movie_genres(item[0])])

for item in display_results:
    print(item[0]+" "*(20-len(item[0]))+"|"+
          item[1]+" "*(50-len(item[1]))+"|"+
          item[2]+" "*(50-len(item[2]))+"|"+
          item[3]+" "*(50-len(item[3]))+"|"+
          item[4])

Character EDDIE from halloween h20: 20 years later ['drama', 'horror', 'thriller']


Character ID :1220  |tf-idf                                            |genre                                             |doc2vec                                           |genre
Recommendation 1    |halloween h20: 20 years later                     |['drama', 'horror', 'thriller']                   |halloween                                         |['horror', 'thriller']
Recommendation 2    |the witching hour                                 |['documentary', 'short']                          |what women want                                   |['comedy', 'fantasy', 'romance']
Recommendation 3    |copycat                                           |['crime', 'mystery', 'thriller']                  |a nightmare on elm street 4: the dream master     |['fantasy', 'horror', 'thriller']
Recommendation 4    |the lost boys                                     |['comedy', 'fantasy', 'horror', 'thriller']       |a nightmare on elm street part 2: freddy's revenge|['fantasy', 'horror', 'thriller']


In [24]:
# 1223
tmp_train_corpus = tfidf_train_corpus + tfidf_get_character_lines(1223)
test_data = StringIO(tmp_train_corpus)
ds = pd.read_csv(test_data)
_train(ds)

In [25]:
test_id = 1223
tfidf_result = predict(test_id, 10)
doc2vec_result = get_doc2vec_similarity(test_id)

test_character_metadata = get_character_metadata(test_id)
print("Character {} from {} {}".format(test_character_metadata[1], test_character_metadata[3], 
                                       get_movie_genres(int(test_character_metadata[2][1:]))))
display_results = []
display_results.append(["Character ID :" + str(test_id), "tf-idf", "genre", "doc2vec", "genre"])

for idx, item in enumerate(doc2vec_result):   
    display_results.append(['Recommendation ' + str(idx+1), get_movie_title(tfidf_result[idx][0]), get_movie_genres(tfidf_result[idx][0]), 
                            get_movie_title(item[0]), get_movie_genres(item[0])])

for item in display_results:
    print(item[0]+" "*(20-len(item[0]))+"|"+
          item[1]+" "*(50-len(item[1]))+"|"+
          item[2]+" "*(50-len(item[2]))+"|"+
          item[3]+" "*(50-len(item[3]))+"|"+
          item[4])

Character KERI from halloween h20: 20 years later ['drama', 'horror', 'thriller']


Character ID :1223  |tf-idf                                            |genre                                             |doc2vec                                           |genre
Recommendation 1    |halloween h20: 20 years later                     |['drama', 'horror', 'thriller']                   |midnight express                                  |['biography', 'crime', 'drama', 'thriller']
Recommendation 2    |sex, lies, and videotape                          |['drama']                                         |the black dahlia                                  |['crime', 'drama', 'history', 'mystery', 'thriller']
Recommendation 3    |hannah and her sisters                            |['comedy', 'drama', 'romance']                    |the getaway                                       |['action', 'adventure', 'crime', 'thriller']
Recommendation 4    |u-turn                                            |['drama']                                         |what women want                  

In [26]:
# u7821 +++$+++ HAN +++$+++ m529
test_id = 7821
tmp_train_corpus = tfidf_train_corpus + tfidf_get_character_lines(7821)
test_data = StringIO(tmp_train_corpus)
ds = pd.read_csv(test_data)
_train(ds)

tfidf_result = predict(test_id, 10)
doc2vec_result = get_doc2vec_similarity(test_id)

test_character_metadata = get_character_metadata(test_id)
print("Character {} from {} {}".format(test_character_metadata[1], test_character_metadata[3], 
                                       get_movie_genres(int(test_character_metadata[2][1:]))))
display_results = []
display_results.append(["Character ID :" + str(test_id), "tf-idf", "genre", "doc2vec", "genre"])

for idx, item in enumerate(doc2vec_result):   
    display_results.append(['Recommendation ' + str(idx+1), get_movie_title(tfidf_result[idx][0]), get_movie_genres(tfidf_result[idx][0]), 
                            get_movie_title(item[0]), get_movie_genres(item[0])])

for item in display_results:
    print(item[0]+" "*(20-len(item[0]))+"|"+
          item[1]+" "*(50-len(item[1]))+"|"+
          item[2]+" "*(50-len(item[2]))+"|"+
          item[3]+" "*(50-len(item[3]))+"|"+
          item[4])
    
# u7824 +++$+++ LUKE +++$+++ m529
test_id = 7824
tmp_train_corpus = tfidf_train_corpus + tfidf_get_character_lines(7824)
test_data = StringIO(tmp_train_corpus)
ds = pd.read_csv(test_data)
_train(ds)

tfidf_result = predict(test_id, 10)
doc2vec_result = get_doc2vec_similarity(test_id)

test_character_metadata = get_character_metadata(test_id)
print("Character {} from {} {}".format(test_character_metadata[1], test_character_metadata[3], 
                                       get_movie_genres(int(test_character_metadata[2][1:]))))
display_results = []
display_results.append(["Character ID :" + str(test_id), "tf-idf", "genre", "doc2vec", "genre"])

for idx, item in enumerate(doc2vec_result):   
    display_results.append(['Recommendation ' + str(idx+1), get_movie_title(tfidf_result[idx][0]), get_movie_genres(tfidf_result[idx][0]), 
                            get_movie_title(item[0]), get_movie_genres(item[0])])

for item in display_results:
    print(item[0]+" "*(20-len(item[0]))+"|"+
          item[1]+" "*(50-len(item[1]))+"|"+
          item[2]+" "*(50-len(item[2]))+"|"+
          item[3]+" "*(50-len(item[3]))+"|"+
          item[4])
    
# random character
# test_id = random.randint(0, len(test_corpus))
# tmp_train_corpus = tfidf_train_corpus + tfidf_get_character_lines(test_id)
# test_data = StringIO(tmp_train_corpus)
# ds = pd.read_csv(test_data)
# _train(ds)
# tfidf_result = predict(test_id, 10)
# doc2vec_result = get_doc2vec_similarity(test_id)
# 
# test_character_metadata = get_character_metadata(test_id)
# print("Character {} from {} {}".format(test_character_metadata[1], test_character_metadata[3], 
#                                        get_movie_genres(int(test_character_metadata[2][1:]))))
# display_results = []
# display_results.append(["Character ID :" + str(test_id), "tf-idf", "genre", "doc2vec", "genre"])
# 
# for idx, item in enumerate(doc2vec_result):   
#     display_results.append(['Recommendation ' + str(idx+1), get_movie_title(tfidf_result[idx][0]), get_movie_genres(tfidf_result[idx][0]), 
#                             get_movie_title(item[0]), get_movie_genres(item[0])])
# 
# for item in display_results:
#     print(item[0]+" "*(20-len(item[0]))+"|"+
#           item[1]+" "*(50-len(item[1]))+"|"+
#           item[2]+" "*(50-len(item[2]))+"|"+
#           item[3]+" "*(50-len(item[3]))+"|"+
#           item[4])

Character HAN from star wars ['action', 'adventure', 'fantasy', 'sci-fi']
Character ID :7821  |tf-idf                                            |genre                                             |doc2vec                                           |genre
Recommendation 1    |star wars                                         |['action', 'adventure', 'fantasy', 'sci-fi']      |house of 1000 corpses                             |['horror']
Recommendation 2    |star wars: the empire strikes back                |['animation', 'adventure', 'action', 'fantasy']   |what women want                                   |['comedy', 'fantasy', 'romance']
Recommendation 3    |star wars: episode vi - return of the jedi        |['action', 'adventure', 'fantasy', 'sci-fi']      |hellbound: hellraiser ii                          |['drama', 'horror', 'thriller']
Recommendation 4    |u-turn                                            |['drama']                                         |the black dahlia         


Recommendation 7    |three kings                                       |['action', 'adventure', 'comedy', 'drama', 'war'] |mulholland dr.                                    |['drama', 'mystery', 'thriller']
Recommendation 8    |smoke                                             |['comedy', 'drama']                               |a nightmare on elm street part 2: freddy's revenge|['fantasy', 'horror', 'thriller']
Recommendation 9    |the horse whisperer                               |['drama', 'romance', 'western']                   |halloween                                         |['horror', 'thriller']
Recommendation 10   |shampoo                                           |['drama', 'romance']                              |the salton sea                                    |['crime', 'drama', 'mystery', 'thriller']
Character LUKE from star wars ['action', 'adventure', 'fantasy', 'sci-fi']


Character ID :7824  |tf-idf                                            |genre                                             |doc2vec                                           |genre
Recommendation 1    |star wars                                         |['action', 'adventure', 'fantasy', 'sci-fi']      |house of 1000 corpses                             |['horror']
Recommendation 2    |star wars: the empire strikes back                |['animation', 'adventure', 'action', 'fantasy']   |slither                                           |['comedy', 'horror', 'sci-fi']
Recommendation 3    |star wars: episode vi - return of the jedi        |['action', 'adventure', 'fantasy', 'sci-fi']      |there's something about mary                      |['comedy', 'romance']
Recommendation 4    |broadcast news                                    |['comedy', 'drama', 'romance']                    |a nightmare on elm street part 2: freddy's revenge|['fantasy', 'horror', 'thriller']
Recommendation 5    |the h

In [27]:
print('Test Document ({}): «{}»\n'.format(7821, ' '.join(test_corpus[get_corpus_index(test_corpus, 7821)].words)))

print('Test Document ({}): «{}»\n'.format(529, ' '.join(train_corpus[get_corpus_index(train_corpus, 529)].words)))

print('Test Document ({}): «{}»\n'.format(92, ' '.join(train_corpus[get_corpus_index(train_corpus, 92)].words)))

Test Document (7821): «han solo captain of this vessel who in charge then ben kenobi luke starkiller here is leading our expedition well we ll see you might get your stuff together we ll be coming up on organa major soon the empire must have gotten here first the planet has been totally blown away it would have taken thousand ships with lot more fire power than ve ever seen if the empire had new weapon that could do this would have heard something know about it well now you know the enemy is on the move we haven much time well ve brought you here what now we have to find he rebels what we re carrying belongs to them their bases are very well hidden all the power of the empire can find them do you know where they are no not anymore not taking you on an impossible search across the galaxy was only paid to get you here and now you re here ll take my other five thousand and you re on your own ll leave you on the nearest system well for one reason we don have your other five thousand who go

In [28]:
# u163 +++$+++ m11 +++$+++ IRAQI SOLDIER
test_id = 947
tmp_train_corpus = tfidf_train_corpus + tfidf_get_character_lines(947)
test_data = StringIO(tmp_train_corpus)
ds = pd.read_csv(test_data)
_train(ds)

tfidf_result = predict(test_id, 10)
doc2vec_result = get_doc2vec_similarity(test_id)

test_character_metadata = get_character_metadata(test_id)
print("Character {} from {}".format(test_character_metadata[1], test_character_metadata[3]))

display_results = []
display_results.append(["Character ID :" + str(test_id), "tf-idf", "doc2vec"])

for idx, item in enumerate(doc2vec_result):   
    display_results.append(['Recommendation ' + str(idx+1), get_movie_title(tfidf_result[idx][0]), get_movie_title(item[0])])

for item in display_results:
    print(item[0]+" "*(20-len(item[0]))+"|"+
          item[1]+" "*(40-len(item[1]))+"|"+
          item[2])

Character COP from frances
Character ID :947   |tf-idf                                  |doc2vec
Recommendation 1    |frances                                 |detroit rock city
Recommendation 2    |the lost son                            |sugar & spice
Recommendation 3    |casino                                  |drop dead gorgeous
Recommendation 4    |suburbia                                |bull durham
Recommendation 5    |my girl 2                               |monkeybone
Recommendation 6    |magnolia                                |hudson hawk
Recommendation 7    |the big lebowski                        |stepmom
Recommendation 8    |u turn                                  |heathers
Recommendation 9    |punch-drunk love                        |love & basketball
Recommendation 10   |barton fink                             |south park: bigger longer & uncut
