In [None]:
# Doc2Vec trial

In [1]:
import gensim
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize, punkt

#setting up libraries
import pandas as pd
pd.set_option('display.max_columns', 50)
import numpy as np
import pickle
import matplotlib.pyplot as plt
import time

In [2]:
# Load the token dataframe, renumber its rows from 0, and keep the first 100
# documents as a small working set for this trial.
# NOTE: read_pickle unpickles the file, so only point it at trusted data.

df = pd.read_pickle("../../../data/prd/Paper/FR_meta_and_final_tokens_23DEC21.pkl")
df.reset_index(drop=True, inplace=True)

# Full 'final_tokens' column, plus the 100-row subset used in the cells below.
data = df['final_tokens']
data_limited = data.head(100)

print(data)
print(data_limited)

0          project explore game base metaphor enhanced ga...
1          institution franklin institute science museum ...
2          program include small group conversation citiz...
3          partnership american chemical society acs nati...
4          center molecular interfacing cmi enable integr...
                                 ...                        
1143864    circadian_rhythm fundamental endogenous proper...
1143865    recent research suggest learn genetic high sch...
1143866    covid_19_pandemic cause unprecedented disrupti...
1143867    subduction_zone location earth tectonic_plate ...
1143868    stretch mile illinois bank mississippi_river a...
Name: final_tokens, Length: 1143869, dtype: object
0     project explore game base metaphor enhanced ga...
1     institution franklin institute science museum ...
2     program include small group conversation citiz...
3     partnership american chemical society acs nati...
4     center molecular interfacing cmi enable integr..

In [3]:
# Build the Doc2Vec training corpus: each document is lowercased, split into
# word tokens, and wrapped in a TaggedDocument whose single tag is the
# document's position in data_limited (as a string).

tagged_data = [
    TaggedDocument(words=word_tokenize(doc_text.lower()), tags=[str(doc_pos)])
    for doc_pos, doc_text in enumerate(data_limited)
]
print(tagged_data)

[TaggedDocument(words=['project', 'explore', 'game', 'base', 'metaphor', 'enhanced', 'game', 'design', 'game', 'method', 'apply', 'cognitive', 'science', 'metaphor', 'theory', 'design', 'computer', 'mediate', 'learning', 'environment', 'process', 'use', 'structure', 'mapping', 'theory', 'design', 'videogame', 'world', 'align', 'science', 'concept', 'rigorous', 'specification', 'procedure', 'map', 'relational', 'structure', 'targeted', 'concept', 'game', 'world', 'game', 'design', 'translate', 'target', 'concept', 'game', 'system', 'game', 'play', 'game', 'goal', 'relational', 'structure', 'game', 'world', 'design', 'analog', 'targeted', 'conceptual', 'domain', 'player', 'begin', 'construct', 'mental', 'model', 'target', 'concept', 'interactive', 'gameplay', 'make', 'learn', 'concrete', 'embody', 'gameplay', 'experience', 'design', 'guide', 'learner', 'discover', 'relational', 'structure', 'target', 'concept', 'gameplay', 'readiness', 'activity', 'prepare', 'learner', 'subsequent', 'ins

In [4]:
# Create the Doc2Vec model, build its vocabulary from the tagged corpus,
# train it, and persist it to disk as "d2v.model".
# The model only knows the documents it was trained on, so the training set
# would have to include the subset of articles that relate to digitalization.

model = Doc2Vec(epochs=10, min_count=3, vector_size=10)
model.build_vocab(tagged_data)

# corpus_count and epochs are read back from the model so that training uses
# the same values that build_vocab / the constructor recorded.
model.train(
    tagged_data,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)
model.save("d2v.model")

In [5]:
# Reload the saved model and look up the documents WITHIN THE TRAINING SET
# that are closest to the document tagged '0'.
# model.dv holds the trained document vectors; most_similar returns
# (tag, cosine similarity) pairs ordered from most to least similar.

model = gensim.models.doc2vec.Doc2Vec.load("d2v.model")
similar_doc = model.dv.most_similar('0')

# Show the five best matches.
print(similar_doc[:5])

[('50', 0.9983993768692017), ('38', 0.9983150959014893), ('27', 0.9981359839439392), ('87', 0.9977405071258545), ('34', 0.9976025819778442)]


In [7]:
# Text of the document tagged '50' (the top match in the recorded most_similar output).
print(data_limited.loc[50])

scigirl produce twin_cities public television new pbs half hour television series accompany web outreach activity target girl age year old intended impact foster great interest stem girl age parent girl diverse community high priority connect girl exist quality stem education opportunity community contribute grow body research deepen understanding effective way engage girl stem activity encourage pursue stem career projects strategic partner national girls collaborative project ngcp seattle franklin institute philadelphia ngcp link scigirls_tv network community base science program girl franklin institute help form affiliate network science museum implement scigirls_tv outreach activity addition broadcast pbs plus video distribute online streaming dvds version download portable video player significant web component social networking feature allow girl interested science connect nation barbara flagg multimedia research conduct formative evaluation rough cut subsequent summative_evaluat

In [6]:
# Text of the document tagged '0' (the query passed to most_similar), for manual comparison.
print(data_limited.loc[0])

project explore game base metaphor enhanced game design game method apply cognitive science metaphor theory design computer mediate learning environment process use structure mapping theory design videogame world align science concept rigorous specification procedure map relational structure targeted concept game world game design translate target concept game system game play game goal relational structure game world design analog targeted conceptual domain player begin construct mental model target concept interactive gameplay make learn concrete embody gameplay experience design guide learner discover relational structure target concept gameplay readiness activity prepare learner subsequent instruction primary objective cyber enable teaching learning game base metaphor enhanced learning objects cygames project empirically test application cognitive science theory aptly design videogame world provide common experience prepare educator learner achieve success cygames employ cognitive 

In [9]:
# Infer an embedding for an unseen query and rank the training documents
# against it.

#test_data = word_tokenize("digitalization digitization digital transformation artificial intelligence".lower())

test_data = word_tokenize("research".lower())

# infer_vector returns the query's embedding in the model's vector space.
# Despite the variable name, this is NOT a similarity score.
similarity_vector = model.infer_vector(test_data)
print("Test similarity vector: ", similarity_vector)

# Mean of the embedding's components. Kept so existing names/outputs still
# exist, but it has no interpretation as a similarity to the training set.
similarity_average = sum(similarity_vector)/len(similarity_vector)
print("Test similarity avg: ", similarity_average)

# Actual similarity of the query to the training set: cosine similarity
# between the inferred vector and every trained document vector, returned as
# (tag, score) pairs with the closest documents first.
similar_to_test = model.dv.most_similar([similarity_vector], topn=5)
print("Most similar training docs: ", similar_to_test)


Test similarity vector:  [ 0.01629844 -0.00892462 -0.01038874  0.01927346  0.05803638  0.02760178
 -0.00152147  0.01636801  0.00529689  0.03317915]
Test similarity avg:  0.015521926782093942


In [None]:
#Doc2vec is an unsupervised learning algorithm that produces vector representations of sentences, paragraphs, or documents.
#Doc2vec can represent an entire document with a single vector, so we don't have to average word vectors to create a document vector.

In [None]:
#Variables to look into

# alpha - the initial learning rate
# min_alpha - the learning rate decays linearly to min_alpha as training progresses
        # catastrophic interference - tendency of an artificial neural network to completely and abruptly forget previously learned information upon learning new information
# epochs - number of epochs used to train on (or infer) a new document - one epoch is when the entire dataset is passed forward and backward through the neural network exactly ONCE.
        # too few -> underfitting, too many -> overfitting
    
# how do we examine if the model is sufficiently trained?
        #https://datascience.stackexchange.com/questions/103927/how-to-examine-if-a-doc2vec-model-is-sufficiently-trained


In [None]:
# Doc2Vec white paper
    #https://cs.stanford.edu/~quocle/paragraph_vector.pdf
    
# Code references 
    #https://medium.com/red-buffer/doc2vec-computing-similarity-between-the-documents-47daf6c828cd
    
# Literature references 
    #https://towardsdatascience.com/calculating-document-similarities-using-bert-and-other-models-b2c1a29c9630
    #https://towardsdatascience.com/nlp-101-word2vec-skip-gram-and-cbow-93512ee24314
    #https://towardsdatascience.com/nlp-embedding-techniques-51b7e6ec9f92
    #https://github.com/v1shwa/document-similarity