In [37]:
#% prehaces a magic function - matplotlib plots will be produced within the notebook 
%matplotlib inline

In [38]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [39]:
import gensim
import pandas as pd
import gensim, spacy, logging, warnings
from gensim.models.doc2vec import Doc2Vec, TaggedDocument 
from gensim.utils import lemmatize, simple_preprocess
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
import re 

In [45]:
#importing training data
df = pd.read_csv('C:/Users/patri/NLP/tagged_plots_movielens.csv')
train_file = df['plot']

train_file.head()

0    A little boy named Andy loves to be in his roo...
1    When two kids find and play a magical board ga...
2    Things don't seem to change much in Wabasha Co...
3    Hunters and their prey--Neil and his professio...
4    An ugly duckling having undergone a remarkable...
Name: plot, dtype: object

In [46]:
def sent_to_words(sentences):
    for sent in sentences:
        sent = re.sub('\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub('\s+', ' ', sent)  # remove newline chars
        sent = re.sub("\'", "", sent)  # remove single quotes
        sent = gensim.utils.simple_preprocess(str(sent), deacc=True) 
        yield(sent)  

In [50]:
#importing test data
df2 = pd.read_csv('C:/Users/patri/NLP/movie_plots_test.csv')
test_file = df2['plot']

In [51]:
train_corpus = list(train_file)
test_corpus = list(test_file)

In [52]:
print(train_corpus[:2])

['A little boy named Andy loves to be in his room, playing with his toys, especially his doll named "Woody". But, what do the toys do when Andy is not with them, they come to life. Woody believes that he has life (as a toy) good. However, he must worry about Andy\'s family moving, and what Woody does not know is about Andy\'s birthday party. Woody does not realize that Andy\'s mother gave him an action figure known as Buzz Lightyear, who does not believe that he is a toy, and quickly becomes Andy\'s new favorite toy. Woody, who is now consumed with jealousy, tries to get rid of Buzz. Then, both Woody and Buzz are now lost. They must find a way to get back to Andy before he moves without them, but they will have to pass through a ruthless toy killer, Sid Phillips.', 'When two kids find and play a magical board game, they release a man trapped for decades in it and a host of dangers that can only be stopped by finishing the game.']


In [53]:
print(test_corpus[:2])

["Jules Daly is struggling to raise her orphaned niece and nephew (Maggie and Milo) alone, but it isn't easy after getting downsized out of her antique sales job while Milo rebels against the death of his parents through petty theft. With things looking bleak for Christmas, an English butler named Paisley arrives with an invitation for all to come see the kids' emotionally distant grandfather who lives in Castlebury Hall, somewhere near Liechtenstein. With nothing to hold them back, they go, but the grandfather - Edward, Duke of Castlebury - is rather cold over their visit to his castle. So is his other surviving son, Ashton, Prince of Castlebury. Before long, they're all having a good time and looking forward to hosting a Christmas Eve ball, but Jules overhears a conversation from which she draws a wrong conclusion.", nan]


In [57]:
#create a lemmatizer function - simply returns words to their root form
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [58]:
#use list comprehension to create a function that tokenizes and makes each word lowercase
tokenized_train_doc = []
for d in train_corpus:
    tokenized_train_doc.append(word_tokenize(d.lower()))
tokenized_train_doc

AttributeError: 'float' object has no attribute 'lower'

In [56]:
#we leave the test document as a list of lists; it should not be converted into gensim format
tokenized_test_doc = []
for d in test_corpus:
    tokenized_test_doc.append(word_tokenize(d.lower()))
tokenized_test_doc#we leave the test document as a list of lists; it should not be converted into gensim format
tokenized_test_doc = []
for d in test_corpus:
    tokenized_test_doc.append(word_tokenize(d.lower()))
tokenized_test_doc

AttributeError: 'float' object has no attribute 'lower'

In [26]:
# Convert tokenized training document into gensim formated tagged data
tagged_train_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_train_doc)]
tagged_train_data[:2]

[TaggedDocument(words=['adrian', ',', 'on', 'a', 'recent', 'call', 'review', 'you', 'could', 'have', 'asked', 'a', 'better', 'question', 'of', 'the', 'caller', '.', 'this', 'has', 'occurred', 'before', 'where', 'the', 'caller', 'does', 'not', 'quite', 'understand', 'the', 'question', 'leading', 'to', 'longer', 'call', 'times', '.'], tags=[0]),
 TaggedDocument(words=['adrian', ',', 'your', 'constructive', 'participation', 'in', 'skype', 'on', 'two', 'calls', 'alerted', 'colleagues', 'to', 'situations', 'enabling', 'their', 'efficient', 'handling', 'of', 'the', 'calls', '.'], tags=[1])]

In [59]:
#Now, we'll instantiate a Doc2Vec model
#first parameter is your tags
#second parameter,vector size, is the number of results the model will return
#the third parameter,window, idk yet 
#fourth parameter,min_count,is the number of times a word must occur to not be discarded; apparently infrequently occurring words can impact model performance negatively
#fifth parameter, workers=cores , use these many worker threads to train the model (=faster training with multicore machines).

#Can also pull in the following parameter
#If dm=0, distributed bag of words (PV-DBOW) is used; if dm=1,‘distributed memory’ (PV-DM) is used.
#min_count=2, ignores all words with total frequency lower than this.
#negative=5 , specifies how many “noise words” should be drawn.
#hs=0 , and negative is non-zero, negative sampling will be used.
#sample=0 , the threshold for configuring which higher-frequency words are randomly down sampled.

model = Doc2Vec(tagged_train_data, vector_size=10, window=2, min_count=2, workers=4)

2020-04-03 20:43:47,987 : INFO : collecting all words and their counts
2020-04-03 20:43:47,989 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-04-03 20:43:48,246 : INFO : collected 8661 word types and 6545 unique tags from a corpus of 6545 examples and 363678 words
2020-04-03 20:43:48,250 : INFO : Loading a fresh vocabulary
2020-04-03 20:43:48,273 : INFO : effective_min_count=2 retains 6115 unique words (70% of original 8661, drops 2546)
2020-04-03 20:43:48,275 : INFO : effective_min_count=2 leaves 361132 word corpus (99% of original 363678, drops 2546)
2020-04-03 20:43:48,325 : INFO : deleting the raw counts dictionary of 8661 items
2020-04-03 20:43:48,328 : INFO : sample=0.001 downsamples 49 most-common words
2020-04-03 20:43:48,332 : INFO : downsampling leaves estimated 234870 word corpus (65.0% of prior 361132)
2020-04-03 20:43:48,367 : INFO : estimated required memory for 6115 words and 10 dimensions: 3808500 bytes
2020-04-03 20:43:48,372 : INF

In [62]:
#we can use the trained model to infer a vector for any series of tokens based on cosine similarity
vector = model.infer_vector(['Lucas', 'you', 'are', 'a', 'bro', 'fires'])
print(vector)

[-0.00110894  0.02244658  0.02449248 -0.00632843 -0.03135409 -0.00023116
  0.01906535 -0.04960615 -0.01973859 -0.01960687]


In [63]:
vector = model.infer_vector(['This', 'was', 'a', 'great','call','review'])
print(vector)

[ 0.00934602 -0.01839791 -0.02816632  0.02503777 -0.12125958  0.06920887
  0.01105353 -0.10474823  0.08355144 -0.08956536]


In [30]:
vector = model.infer_vector(['Thanks', 'for', 'the', 'constructive','feedback'])
print(vector)

[ 0.03782747  0.06630251  0.05836515 -0.02334113  0.01048507  0.00764417
 -0.0926734  -0.04646827  0.01865775 -0.00746181]


In [None]:
#we'll first infer new vectors for each document of the training corpus, compare the inferred vectors with the training corpus, and then returning the rank of the document based on self-similarity.
ranks = []
second_ranks = []
for doc_id in range(len(tagged_train_data)):
    inferred_vector = model.infer_vector(tagged_train_data[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

In [None]:
#we use a counter to evaluate how many times equivalent values are added
#I believe these results are saying that the first ranking similarity document is returned most frequently, the second most similar document is returned the second most frequently, which is a good sign 
import collections

counter = collections.Counter(ranks)
print(counter)

In [66]:
#similar and dissimilar documents
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(tagged_train_data[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(tagged_train_data[sims[index][0]].words)))

Document (6544): «call review 10/19/19 1229pm . good inbound hotcard call . you were able to get all of the information necessary in order to process this lost card claim .»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d10,n5,w2,mc2,s0.001,t4):

MOST (5924, 0.9840618371963501): «call review 9/6/19 4:52pm . keeps conversation in scope , you identified fraud , informed of next steps , and moved on . efficient call .»

SECOND-MOST (1723, 0.9803169965744019): «call review 3/16/19 938pm your usual high degree of professionalism came through on this call in terms of efficient and effective process , customer service , respect/courtesy and quality . great work .»

MEDIAN (2411, 0.7169733047485352): «call review 4/24/19 6:40pm - this is a call review of an outbound call where you left a message . this message was a bit hurried and rushed . please make sure you are n't rushing through these as it sounds less professional when we do .»

LEAST (1309, -0.6040962338447571): «call review 2/21/19 

In [67]:
# Pick a random document from the corpus and infer a vector from the model
import random
doc_id = random.randint(0, len(tagged_train_data) - 1)

# Compare and print the second-most-similar document
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(tagged_train_data[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(tagged_train_data[sim_id[0]].words)))

Train Document (5094): «call review 8/10/19 1:48pm . manages customer dialogue efficiently . hc call , found the card , blocked it , checked last transaction . good»

Similar Document (4058, 0.9087874293327332): «call review 7/1/19 3:18pm . professionally managed the call , manages customer dialogue efficiently . you identified fraud , removed the block , informed of next steps , and moved on . efficient call .»



In [68]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(tokenized_test_doc) - 1)
inferred_vector = model.infer_vector(tokenized_test_doc[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))


# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(tokenized_test_doc[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(tagged_train_data[sims[index][0]].words)))

Test Document (0): «jules daly is struggling to raise her orphaned niece and nephew ( maggie and milo ) alone , but it is n't easy after getting downsized out of her antique sales job while milo rebels against the death of his parents through petty theft . with things looking bleak for christmas , an english butler named paisley arrives with an invitation for all to come see the kids ' emotionally distant grandfather who lives in castlebury hall , somewhere near liechtenstein . with nothing to hold them back , they go , but the grandfather - edward , duke of castlebury - is rather cold over their visit to his castle . so is his other surviving son , ashton , prince of castlebury . before long , they 're all having a good time and looking forward to hosting a christmas eve ball , but jules overhears a conversation from which she draws a wrong conclusion .»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d10,n5,w2,mc2,s0.001,t4):

MOST (5276, 0.8755322098731995): «call review 08/25/19 93