In [37]:
#% prefaces a magic function - matplotlib plots will be produced within the notebook
%matplotlib inline

In [38]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [39]:
import gensim
import pandas as pd
import gensim, spacy, logging, warnings
from gensim.models.doc2vec import Doc2Vec, TaggedDocument 
from gensim.utils import lemmatize, simple_preprocess
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
import re 

In [83]:
# Import the training data; the 'plot' column holds one plot summary per row.
# NOTE(review): this absolute Windows path is machine-specific; consider a path
# relative to a configurable data directory so the notebook runs elsewhere.
df = pd.read_csv('C:/Users/patri/NLP/tagged_plots_movielens.csv')
# Cast to str so every entry is a string (a missing value would become 'nan').
train_file = df['plot'].astype(str)

train_file.head()

0    A little boy named Andy loves to be in his roo...
1    When two kids find and play a magical board ga...
2    Things don't seem to change much in Wabasha Co...
3    Hunters and their prey--Neil and his professio...
4    An ugly duckling having undergone a remarkable...
Name: plot, dtype: object

In [84]:
def sent_to_words(sentences):
    """Clean raw text documents and yield each one as a list of tokens.

    For every document, e-mail-like tokens are removed, runs of whitespace
    (including newlines) are collapsed to a single space, single quotes are
    dropped, and the result is tokenized with
    ``gensim.utils.simple_preprocess`` using ``deacc=True``.

    Args:
        sentences: Iterable of documents. Each item is coerced with ``str()``
            before cleaning, so non-string items do not break the regexes.

    Yields:
        The token list returned by ``simple_preprocess`` for each document,
        in input order.
    """
    for sent in sentences:
        # Coerce first: re.sub raises TypeError when handed a non-string.
        sent = str(sent)
        # Raw strings keep \S and \s from being read as (invalid) string escapes.
        sent = re.sub(r'\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub(r'\s+', ' ', sent)  # collapse whitespace, incl. newlines
        sent = re.sub("'", "", sent)  # remove single quotes
        yield gensim.utils.simple_preprocess(sent, deacc=True)

In [85]:
# Import the test data (the 'plot' column is read the same way as for training).
# NOTE(review): this absolute Windows path is machine-specific; consider a path
# relative to a configurable data directory so the notebook runs elsewhere.
df2 = pd.read_csv('C:/Users/patri/NLP/movie_plots_test.csv')
# NOTE(review): astype(str) turns missing plots into the literal string 'nan'
# (one is visible in the printed test corpus); consider dropping them first.
test_file = df2['plot'].astype(str)

In [87]:
# Materialize both pandas Series as plain Python lists of plot strings.
train_corpus = list(train_file)
test_corpus = list(test_file)

In [88]:
print(train_corpus[:2])

['A little boy named Andy loves to be in his room, playing with his toys, especially his doll named "Woody". But, what do the toys do when Andy is not with them, they come to life. Woody believes that he has life (as a toy) good. However, he must worry about Andy\'s family moving, and what Woody does not know is about Andy\'s birthday party. Woody does not realize that Andy\'s mother gave him an action figure known as Buzz Lightyear, who does not believe that he is a toy, and quickly becomes Andy\'s new favorite toy. Woody, who is now consumed with jealousy, tries to get rid of Buzz. Then, both Woody and Buzz are now lost. They must find a way to get back to Andy before he moves without them, but they will have to pass through a ruthless toy killer, Sid Phillips.', 'When two kids find and play a magical board game, they release a man trapped for decades in it and a host of dangers that can only be stopped by finishing the game.']


In [89]:
print(test_corpus[:2])

["Jules Daly is struggling to raise her orphaned niece and nephew (Maggie and Milo) alone, but it isn't easy after getting downsized out of her antique sales job while Milo rebels against the death of his parents through petty theft. With things looking bleak for Christmas, an English butler named Paisley arrives with an invitation for all to come see the kids' emotionally distant grandfather who lives in Castlebury Hall, somewhere near Liechtenstein. With nothing to hold them back, they go, but the grandfather - Edward, Duke of Castlebury - is rather cold over their visit to his castle. So is his other surviving son, Ashton, Prince of Castlebury. Before long, they're all having a good time and looking forward to hosting a Christmas Eve ball, but Jules overhears a conversation from which she draws a wrong conclusion.", 'nan']


In [90]:
#create a lemmatizer function - keeps only tokens with an allowed part-of-speech tag and returns their lemmas (root forms)
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document's tokens are joined with spaces, passed through the
    module-level ``nlp`` callable, and reduced to the ``lemma_`` of every
    token whose ``pos_`` is listed in ``allowed_postags``.

    NOTE(review): ``nlp`` is not defined in the visible cells; presumably it is
    a loaded spaCy pipeline -- confirm it is created before this is called.

    Tag reference: https://spacy.io/api/annotation

    Args:
        texts: Iterable of token lists, one per document.
        allowed_postags: Part-of-speech tags whose tokens are kept.

    Returns:
        A list holding one list of lemmas per input document.
    """
    def _lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_lemmas(tokens) for tokens in texts]

In [91]:
# Lowercase every training plot and split it into tokens with NLTK's word_tokenize
# (one token list per plot, built with a list comprehension).
tokenized_train_doc = [word_tokenize(plot.lower()) for plot in train_corpus]
tokenized_train_doc

[['a',
  'little',
  'boy',
  'named',
  'andy',
  'loves',
  'to',
  'be',
  'in',
  'his',
  'room',
  ',',
  'playing',
  'with',
  'his',
  'toys',
  ',',
  'especially',
  'his',
  'doll',
  'named',
  '``',
  'woody',
  "''",
  '.',
  'but',
  ',',
  'what',
  'do',
  'the',
  'toys',
  'do',
  'when',
  'andy',
  'is',
  'not',
  'with',
  'them',
  ',',
  'they',
  'come',
  'to',
  'life',
  '.',
  'woody',
  'believes',
  'that',
  'he',
  'has',
  'life',
  '(',
  'as',
  'a',
  'toy',
  ')',
  'good',
  '.',
  'however',
  ',',
  'he',
  'must',
  'worry',
  'about',
  'andy',
  "'s",
  'family',
  'moving',
  ',',
  'and',
  'what',
  'woody',
  'does',
  'not',
  'know',
  'is',
  'about',
  'andy',
  "'s",
  'birthday',
  'party',
  '.',
  'woody',
  'does',
  'not',
  'realize',
  'that',
  'andy',
  "'s",
  'mother',
  'gave',
  'him',
  'an',
  'action',
  'figure',
  'known',
  'as',
  'buzz',
  'lightyear',
  ',',
  'who',
  'does',
  'not',
  'believe',
  'that',
 

In [92]:
# The test documents stay plain token lists (no TaggedDocument wrapping): they are
# only passed to model.infer_vector later on, never used for training.
# Lowercased and tokenized exactly like the training plots.
tokenized_test_doc = []
for d in test_corpus:
    tokenized_test_doc.append(word_tokenize(d.lower()))
tokenized_test_doc

[['jules',
  'daly',
  'is',
  'struggling',
  'to',
  'raise',
  'her',
  'orphaned',
  'niece',
  'and',
  'nephew',
  '(',
  'maggie',
  'and',
  'milo',
  ')',
  'alone',
  ',',
  'but',
  'it',
  'is',
  "n't",
  'easy',
  'after',
  'getting',
  'downsized',
  'out',
  'of',
  'her',
  'antique',
  'sales',
  'job',
  'while',
  'milo',
  'rebels',
  'against',
  'the',
  'death',
  'of',
  'his',
  'parents',
  'through',
  'petty',
  'theft',
  '.',
  'with',
  'things',
  'looking',
  'bleak',
  'for',
  'christmas',
  ',',
  'an',
  'english',
  'butler',
  'named',
  'paisley',
  'arrives',
  'with',
  'an',
  'invitation',
  'for',
  'all',
  'to',
  'come',
  'see',
  'the',
  'kids',
  "'",
  'emotionally',
  'distant',
  'grandfather',
  'who',
  'lives',
  'in',
  'castlebury',
  'hall',
  ',',
  'somewhere',
  'near',
  'liechtenstein',
  '.',
  'with',
  'nothing',
  'to',
  'hold',
  'them',
  'back',
  ',',
  'they',
  'go',
  ',',
  'but',
  'the',
  'grandfather',

In [93]:
# Wrap each tokenized training plot in a gensim TaggedDocument whose single tag
# is the plot's position in the training corpus.
tagged_train_data = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(tokenized_train_doc)
]
tagged_train_data[:2]

[TaggedDocument(words=['a', 'little', 'boy', 'named', 'andy', 'loves', 'to', 'be', 'in', 'his', 'room', ',', 'playing', 'with', 'his', 'toys', ',', 'especially', 'his', 'doll', 'named', '``', 'woody', "''", '.', 'but', ',', 'what', 'do', 'the', 'toys', 'do', 'when', 'andy', 'is', 'not', 'with', 'them', ',', 'they', 'come', 'to', 'life', '.', 'woody', 'believes', 'that', 'he', 'has', 'life', '(', 'as', 'a', 'toy', ')', 'good', '.', 'however', ',', 'he', 'must', 'worry', 'about', 'andy', "'s", 'family', 'moving', ',', 'and', 'what', 'woody', 'does', 'not', 'know', 'is', 'about', 'andy', "'s", 'birthday', 'party', '.', 'woody', 'does', 'not', 'realize', 'that', 'andy', "'s", 'mother', 'gave', 'him', 'an', 'action', 'figure', 'known', 'as', 'buzz', 'lightyear', ',', 'who', 'does', 'not', 'believe', 'that', 'he', 'is', 'a', 'toy', ',', 'and', 'quickly', 'becomes', 'andy', "'s", 'new', 'favorite', 'toy', '.', 'woody', ',', 'who', 'is', 'now', 'consumed', 'with', 'jealousy', ',', 'tries', 'to

In [94]:
# Instantiate a Doc2Vec model. Passing the corpus to the constructor also builds the
# vocabulary and trains the model right away (see the log output of this cell).
#   1st argument - the training corpus: the list of TaggedDocument objects (words + tags)
#   vector_size  - dimensionality of the learned vectors (10 here), not a number of results
#   window       - maximum distance between the current and the predicted word within a sentence
#   min_count    - words with a total frequency lower than this are discarded; infrequent
#                  words can hurt model quality
#   workers      - number of worker threads used for training (faster on multicore machines)
#
# Other parameters that can be passed:
#   dm       - dm=1 uses 'distributed memory' (PV-DM); dm=0 uses distributed bag of words (PV-DBOW)
#   negative - if > 0, negative sampling is used and this many "noise words" are drawn
#   hs       - hs=1 uses hierarchical softmax; with hs=0 and a non-zero negative, negative sampling is used
#   sample   - threshold for randomly downsampling higher-frequency words

model = Doc2Vec(tagged_train_data, vector_size=10, window=2, min_count=2, workers=4)

2020-04-05 18:30:03,377 : INFO : collecting all words and their counts
2020-04-05 18:30:03,378 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-04-05 18:30:03,409 : INFO : collected 16889 word types and 2115 unique tags from a corpus of 2115 examples and 171158 words
2020-04-05 18:30:03,410 : INFO : Loading a fresh vocabulary
2020-04-05 18:30:03,434 : INFO : effective_min_count=2 retains 8610 unique words (50% of original 16889, drops 8279)
2020-04-05 18:30:03,435 : INFO : effective_min_count=2 leaves 162879 word corpus (95% of original 171158, drops 8279)
2020-04-05 18:30:03,464 : INFO : deleting the raw counts dictionary of 16889 items
2020-04-05 18:30:03,465 : INFO : sample=0.001 downsamples 35 most-common words
2020-04-05 18:30:03,467 : INFO : downsampling leaves estimated 113038 word corpus (69.4% of prior 162879)
2020-04-05 18:30:03,489 : INFO : estimated required memory for 8610 words and 10 dimensions: 5078400 bytes
2020-04-05 18:30:03,490 : 

In [95]:
#we can use the trained model to infer a vector for any list of tokens; the tokens should be preprocessed the same way as the training documents (lowercased here)
vector = model.infer_vector(['Lucas', 'you', 'are', 'a', 'bro', 'fires'])
print(vector)

[-0.00284184 -0.03406283 -0.01430843 -0.01899158 -0.04115798 -0.01812194
  0.01684552 -0.01134556 -0.03135644  0.00044523]


In [96]:
vector = model.infer_vector(['This', 'was', 'a', 'great','call','review'])
print(vector)

[ 0.03998432 -0.07028429 -0.02756258 -0.03654652 -0.03351866 -0.00374898
  0.01504331 -0.01682571  0.05810033 -0.03301215]


In [97]:
vector = model.infer_vector(['Thanks', 'for', 'the', 'constructive','feedback'])
print(vector)

[-0.00640345 -0.01948013  0.03357448 -0.04636478  0.01388736 -0.01138248
 -0.01969098 -0.00164647  0.02135162  0.02893843]


In [98]:
# Self-similarity check: infer a fresh vector for every training document, rank all
# trained document vectors by similarity to it, and record the position at which the
# document itself appears (0 = it is its own closest match). The entry at position 1
# of each ranking is stored as well.
# Note: doc_id and sims keep their values from the last iteration and are read by a later cell.
ranks, second_ranks = [], []
for doc_id, tagged_doc in enumerate(tagged_train_data):
    inferred_vector = model.infer_vector(tagged_doc.words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

    ranked_ids = [docid for docid, sim in sims]
    rank = ranked_ids.index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

2020-04-05 18:30:16,321 : INFO : precomputing L2-norms of doc weight vectors


In [70]:
# Count how often each self-similarity rank occurs across the training documents.
# Rank 0 means the vector re-inferred for a document was most similar to that same
# document's trained vector. In the recorded output rank 0 is the most frequent single
# value, but it covers only 142 documents, so most documents are NOT their own
# closest match -- a sign the model separates the documents poorly.
import collections

counter = collections.Counter(ranks)
print(counter)

Counter({0: 142, 1: 121, 2: 102, 3: 94, 5: 77, 4: 75, 6: 61, 11: 52, 7: 50, 12: 50, 8: 50, 9: 46, 14: 41, 15: 39, 10: 38, 25: 38, 18: 37, 20: 37, 16: 33, 23: 33, 38: 33, 33: 32, 19: 30, 22: 30, 13: 29, 31: 27, 34: 27, 27: 26, 40: 25, 24: 25, 17: 25, 55: 24, 29: 24, 21: 23, 39: 23, 41: 23, 37: 22, 28: 22, 32: 22, 81: 22, 36: 22, 30: 22, 68: 21, 65: 20, 35: 20, 46: 20, 45: 20, 56: 19, 59: 19, 47: 19, 26: 18, 67: 18, 53: 18, 60: 18, 54: 18, 71: 17, 49: 17, 43: 17, 51: 17, 63: 17, 127: 17, 64: 16, 61: 16, 135: 15, 52: 15, 92: 15, 76: 15, 78: 15, 106: 15, 124: 15, 75: 15, 130: 15, 58: 14, 195: 14, 50: 14, 133: 14, 72: 14, 117: 14, 44: 14, 103: 13, 83: 13, 87: 13, 57: 13, 96: 13, 66: 13, 93: 13, 109: 12, 69: 12, 80: 12, 122: 12, 42: 12, 94: 12, 82: 12, 77: 12, 192: 12, 159: 12, 62: 12, 158: 12, 119: 11, 111: 11, 136: 11, 86: 11, 48: 11, 84: 11, 244: 11, 131: 11, 137: 10, 99: 10, 151: 10, 154: 10, 91: 10, 79: 10, 97: 10, 101: 10, 126: 10, 180: 10, 112: 10, 341: 9, 173: 9, 174: 9, 102: 9, 110:




In [99]:
# Print one training document followed by its most, second-most, median and least
# similar training documents.
# NOTE(review): doc_id and sims are not set in this cell; they are whatever an earlier
# cell left behind (after the self-similarity loop: the last training document and its
# ranking). Re-running cells out of order changes what is shown here.
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(tagged_train_data[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
# Each sims entry is indexed as (document id, similarity); index 0 is the top-ranked one.
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(tagged_train_data[sims[index][0]].words)))

Document (2114): «set in a future where a failed climate-change experiment kills all life on the planet except for a lucky few who boarded the snowpiercer , a train that travels around the globe , where a class system emerges .»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d10,n5,w2,mc2,s0.001,t4):

MOST (547, 0.9961217641830444): «when a princess is shrunken by an evil wizard , sinbad must undertake a quest to an island of monsters to cure her and prevent a war .»

SECOND-MOST (1421, 0.9944809675216675): «the movie is an epic story of a young genghis khan and how events in his early life lead him to become a legendary conqueror . the 9-year-old temjin is taken on a trip by his father to select a girl as his future wife . he meets brte , who says she would like to be chosen , which he does . he promises to return after five years to marry her . temjin 's father is poisoned on the trip , and dies . as a boy temjin passes through starvation , humiliations and even slavery , but later 

In [100]:
# Pick a random training document. No new vector is inferred here: the entry stored
# for it in second_ranks by the self-similarity loop is looked up instead.
# NOTE(review): no random seed is set, so a different document is picked on every run.
import random
doc_id = random.randint(0, len(tagged_train_data) - 1)

# Print the picked document next to the entry at position 1 of its similarity ranking;
# sim_id is a (document id, similarity) pair, so sim_id[0] indexes the training data.
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(tagged_train_data[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(tagged_train_data[sim_id[0]].words)))

Train Document (785): «skip and harry are framed for a bank robbery and end up in a western prison . the two eastern boys are having difficulty adjusting to the new life until the warden finds that skip has a natural talent for riding broncos with the inter-prison rodeo coming up .»

Similar Document (1708, 0.9942507147789001): «in 1989 , unwitting utah actors starred in the undisputed worst movie in history : troll 2. two decades later , the legendarily inept film 's child star unravels the improbable , heartfelt story of an alabama dentist-turned-cult movie icon and an italian filmmaker who come to terms with this genuine , internationally revered cinematic failure .»



In [101]:
# Pick a random document from the test corpus, infer a vector for its tokens and rank
# every trained document vector by similarity to it.
# Relies on `random` having been imported by an earlier cell; doc_id and sims are
# overwritten here.
doc_id = random.randint(0, len(tokenized_test_doc) - 1)
inferred_vector = model.infer_vector(tokenized_test_doc[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))


# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(tokenized_test_doc[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
# Each sims entry is indexed as (training document id, similarity).
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(tagged_train_data[sims[index][0]].words)))

Test Document (208): «in this hilarious and sexy stoner adventure , the blazed students and faculty of austin 's ladybird high attempt to shake off their slacker demons to stand up for their lifestyle and the eccentric city they love .»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d10,n5,w2,mc2,s0.001,t4):

MOST (1816, 0.9901902079582214): «annie ( kristen wiig ) , is a maid of honor whose life unravels as she leads her best friend , lillian ( maya rudolph ) , and a group of colorful bridesmaids ( rose byrne , melissa mccarthy , wendi mclendon-covey and ellie kemper ) on a wild ride down the road to matrimony . annie 's life is a mess . but when she finds out her lifetime best friend is engaged , she simply must serve as lillian 's maid of honor . though lovelorn and broke , annie bluffs her way through the expensive and bizarre rituals . with one chance to get it perfect , she 'll show lillian and her bridesmaids just how far you 'll go for someone you love .»

MEDIAN (145, 0.96790