# Introduction


In this notebook, I explore different algorithms and methods for ranking the top 5 most similar documents in a supplied corpus. Due to resource constraints, I apply transfer learning with static word embeddings to generate similarity scores for the provided corpus.

## Setup


In [None]:
#@title
#Install required mods - suppress outputs
!pip install contractions &> /dev/null
!pip install big-O-calculator &> /dev/null


Collecting big-O-calculator
  Downloading https://files.pythonhosted.org/packages/ec/cb/d04c318a65a4cc9b9a88d64d7c1ee482894b15b91f134c3cda295ad72060/big_O_calculator-0.0.9.8.4-py3-none-any.whl
Installing collected packages: big-O-calculator
Successfully installed big-O-calculator-0.0.9.8.4


## Imports

In [None]:
#@title
try:
    import pandas as pd
    import numpy as np
    import sys, os
    import string
    import re, string, unicodedata
    import contractions
    import operator 
    import re
    import multiprocessing
    from absl import logging
    import uuid
    import collections
    from bigO import BigO
    from random import randint
    #sklearn
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    #spacy
    import spacy 

    #nltk
    import nltk

    #nltk downloads
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')
    
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize

    from nltk.tokenize import RegexpTokenizer
    from nltk.stem import WordNetLemmatizer
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer

    #stopwords
    english_stop_words = set(stopwords.words('english'))
    
    #gensim
    import gensim
    from gensim.models import Word2Vec
    from gensim.scripts.glove2word2vec import glove2word2vec
    from gensim.models import FastText
    from gensim.models.phrases import Phrases, Phraser
    from matplotlib import pyplot
    from gensim.models import KeyedVectors
    from gensim.similarities import WmdSimilarity
    import gensim.downloader as api
    from gensim.test.utils import datapath

    #loss callbacks
    from gensim.models.callbacks import CallbackAny2Vec

    #Tensorflow
    import tensorflow as tf

    import tensorflow_hub as hub
   
    
except Exception as e:
    print("Some Modules are Missing {}".format(e))
    
pd.set_option('display.max_columns', None)  

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


## Paths

In [None]:

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Data path
data = '/content/drive/MyDrive/wattpad_test/text/corpus.tsv'

In [None]:
#store embedding location for transfer learning
WORD2VEC_EMBEDDINGS = '/content/drive/MyDrive/wattpad_test/root/input/GoogleNews-vectors-negative300.bin.gz'
GLOVE_EMBEDDINGS =  '/content/drive/MyDrive/wattpad_test/root/input/glove2vec.txt'
FAST_EMBEDDINGS =  '/content/drive/MyDrive/wattpad_test/root/input/crawl-300d-2M.vec'

## Load

In [None]:
text_df = pd.read_csv(data, sep='\t', names=['DocID', 'DocText'])
len(text_df)

998

In [None]:
# Check for duplicates: count the rows per DocID (a size above 1 marks a duplicated id).
dups = text_df.pivot_table(index = ['DocID'], aggfunc ='size') 
# One id is duplicated, which would affect the recommendation algorithm later on, so the ids are re-encoded.

In [None]:
# Check for duplicates: count the rows per DocID (this cell repeats the previous one verbatim).
dups = text_df.pivot_table(index = ['DocID'], aggfunc ='size') 
# One id is duplicated, which would affect the recommendation algorithm later on, so the ids are re-encoded.

In [None]:
#rencode ids to ensure unique
text_df['ID'] = [uuid.uuid4() for _ in range(len(text_df.index))]


In [None]:
#Function to go through our text and get word frequencies
def word_frequencies(texts):
    """Count how often each whitespace-separated token occurs across ``texts``.

    Args:
        texts: pandas Series of strings; every entry is split on whitespace.

    Returns:
        dict mapping each token to its total number of occurrences, with
        keys in first-seen order.
    """
    counts = {}
    for tokens in texts.apply(lambda doc: doc.split()):
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

# Text Preprocessing

This section outlines the steps and code used to preprocess the text.

In [None]:
#Define a set of cleaning methods to preprocess text
#Initialize wordnet
lemma = WordNetLemmatizer()

In [None]:
# Preprocess functions for text
def replace_contractions(text):
    """Return ``text`` after running it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

def noisey_strings(token):
    """Tell whether ``token`` should be kept.

    Despite the name, the result is True for tokens that are NOT noise:
    the token must not be in ``english_stop_words``, must not be a single
    punctuation character, and must be longer than two characters.
    """
    if token in english_stop_words:
        return False
    if token in list(string.punctuation):
        return False
    return len(token) > 2

def preprocess(text):
    """Normalise one raw document into a space-joined string of lemmatised tokens.

    Steps, in order:
      1. expand contractions via ``replace_contractions``;
      2. replace underscores with spaces and collapse whitespace runs
         (including newlines) to a single space;
      3. delete apostrophes and digit runs;
      4. collapse every remaining run of digits / non-word characters to a space;
      5. lower-case, tokenise with ``word_tokenize``, keep tokens accepted by
         ``noisey_strings`` and lemmatise them as verbs;
      6. filter the lemmas with ``noisey_strings`` once more.

    Args:
        text: raw document string.

    Returns:
        str: the surviving lemmas joined by single spaces.
    """
    text = replace_contractions(text)

    # Raw-string patterns: the previous '\_' and '\s+' literals relied on
    # invalid string escape sequences, which Python warns about.
    text = re.sub(r'_', ' ', text)
    text = re.sub(r'\s+', ' ', text)   # collapse whitespace, including newlines
    text = re.sub(r"'", "", text)      # remove single quotes
    text = re.sub(r'[0-9]+', '', text)
    text = re.sub(r"(\d|\W)+", " ", text)

    # Normalise - only lemmatisation is used, to avoid misrepresentation.
    lemmas = [
        lemma.lemmatize(word, pos="v")
        for word in word_tokenize(text.lower())
        if noisey_strings(word)
    ]

    # A lemma may itself be a stop word or too short, so filter a second time.
    kept = [word for word in lemmas if noisey_strings(word)]

    return " ".join(kept)

In [None]:
%%time

text_df['DocText_Clean'] = text_df.DocText.apply(func = preprocess)

CPU times: user 15.2 s, sys: 125 ms, total: 15.3 s
Wall time: 15.3 s


In [None]:
# Store our vocabulary frequencies for quick review
vocab = word_frequencies(text_df['DocText_Clean'])


In [None]:
text = list(text_df['DocText_Clean'])

In [None]:
#Some extra cleaning using spacy nlp pipe, for potentially missed normalizations.
def cleaning(doc):
    """Join the lemmas of the non-stop-word tokens of ``doc`` into one string.

    Args:
        doc: iterable of tokens exposing ``lemma_`` and ``is_stop`` (the
            notebook passes the documents yielded by ``nlp.pipe``).

    Returns:
        str: the lemmas separated by single spaces, or ``''`` when two or
        fewer tokens remain. Previously the function fell through and
        returned None in that case, which breaks the later ``row.split()``
        over the cleaned texts and leaves missing values in the DataFrame
        column that is fed to the TF-IDF vectoriser.
    """
    lemmas = [token.lemma_ for token in doc if not token.is_stop]
    if len(lemmas) <= 2:
        return ''
    return ' '.join(lemmas)

In [None]:
%%time
# Load the spaCy English pipeline with both the NER and the parser components disabled for speed.
nlp = spacy.load('en', disable=['ner', 'parser'])

# Run every document through `cleaning`. NOTE: this rebinds `text` from the preprocessed strings
# to the spaCy-cleaned ones, so re-running this cell cleans the already-cleaned text again.
text = [cleaning(doc) for doc in nlp.pipe(text, batch_size=1000, n_threads=-1)]

CPU times: user 33.2 s, sys: 1.65 s, total: 34.8 s
Wall time: 35 s


In [None]:
df_cleaned =  pd.DataFrame(text, columns=['DocText_Cleaner'])
text_df = text_df.join(df_cleaned)

In [None]:
text_df

Unnamed: 0,DocID,DocText,ID,DocText_Clean,DocText_Cleaner
0,90041,I hear it all the time boo! Too bad I ain't go...,719f7906-f626-45f7-a0cf-f51c5f946e7d,hear time boo bad get patience anyone elses pr...,hear time boo bad patience else problem laugh ...
1,48027,"the dirt after the rain. soaking the hot, parc...",49ec299f-8951-4b96-bc73-b24149d8c89c,dirt rain soak hot parch earth curl bed warm f...,dirt rain soak hot parch earth curl bed warm f...
2,44081,The night before the wedding. A light tear tri...,df4ba250-71f8-4bf9-bbdb-205d6c01f6ff,night wed light tear trickle beautiful bold br...,night would light tear trickle beautiful bold ...
3,54461,"I caught up with Lisa quickly and grabbed her,...",28317d65-dca4-4966-8b48-2e762e3e25bf,catch lisa quickly grab throw shoulder thrash ...,catch lisa quickly grab throw shoulder thrash ...
4,37461,"And, he said as though he had not been interru...",b6874c4b-ef5e-46af-836e-a7285a675397,say though interrupt become friends dumbledore...,interrupt friend dumbledore good guy thank tha...
...,...,...,...,...,...
993,42620,"Uhmm... a day or two. I told them, my mom had ...",0170e266-8320-4629-a5fc-f373a767f0dd,uhmm day two tell mom make buy food honestly k...,uhmm day tell mom buy food honestly know hungr...
994,62987,Chapter 10The harbor was full of thousands of ...,67c509f5-e18d-41ad-b40d-9e24a61b2c44,chapter harbor full thousands people everyone ...,chapter harbor thousand people struggle push h...
995,85335,"Jason's POV""Hello.I'm Blackrose."" Said the lit...",28fa35af-193c-4539-84cc-699b00a284ac,jasons pov hello blackrose say little tiny imp...,jason pov hello blackrose little tiny imp dwar...
996,40867,xx Maggie Who loves 'Saved by the Bell'? I do!...,a95b9f5d-aae7-4361-92d8-d68f1c2893ca,maggie love save bell yes really short prologu...,maggie love save bell yes short prologue try p...


## Building the processed corpus for training

In [None]:
corpus = [row.split() for row in text]


# Baseline Models

# Pretrained Word Embeddings

---


#### Due to resource constraints I apply transfer learning: I test well-known pretrained word embeddings (Word2Vec, GloVe, FastText) and apply each of them to our vocabulary.

In [None]:
#Method to understand the overlap between our doc vocabulary and word embeddings
def check_matches(vocab, embeddings_index):
    """Report how much of ``vocab`` is covered by ``embeddings_index``.

    Prints the share of distinct vocabulary words that have an embedding and
    the share of all token occurrences that those words account for.

    Args:
        vocab: dict mapping word -> frequency in the corpus.
        embeddings_index: object supporting ``embeddings_index[word]`` that
            raises KeyError for words it does not contain (e.g. a dict).

    Returns:
        list of ``(word, frequency)`` tuples for the words without an
        embedding, ordered from most to least frequent.
    """
    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    for word, count in vocab.items():
        try:
            known_words[word] = embeddings_index[word]
        except KeyError:
            # Only a missing key means "no embedding"; any other error propagates.
            unknown_words[word] = count
            nb_unknown_words += count
        else:
            nb_known_words += count

    # Guard the ratios so an empty vocabulary reports 0% instead of dividing by zero.
    total_tokens = nb_known_words + nb_unknown_words
    vocab_ratio = len(known_words) / len(vocab) if vocab else 0.0
    text_ratio = nb_known_words / total_tokens if total_tokens else 0.0

    print('Found embeddings for {:.3%} of vocab'.format(vocab_ratio))
    print('Found embeddings for  {:.3%} of all text'.format(text_ratio))

    # Ascending sort followed by a reversal keeps the original tie ordering.
    return sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

## Setting up our embeddings (Word2Vec, GloVE, FastText)

In [None]:
#initial glove conversions
#GLOVE_EMBEDDINGS = "/content/drive/MyDrive/wattpad_test/root/input/glove2vec.txt"
#glove_word2vec = get_tmpfile("/content/drive/MyDrive/wattpad_test/root/input/glove2vec.txt")
#glove2word2vec(GLOVE_EMBEDDINGS, glove_word2vec)

In [None]:
%%time
#Load up word2vec embeddings
word2vec_embeddings = KeyedVectors.load_word2vec_format(WORD2VEC_EMBEDDINGS, binary=True)

CPU times: user 1min 28s, sys: 2.79 s, total: 1min 31s
Wall time: 1min 34s


In [None]:
%%time
#Load up glove embeddings
glove_embeddings = KeyedVectors.load_word2vec_format(GLOVE_EMBEDDINGS, binary=False)

CPU times: user 1min 34s, sys: 1.04 s, total: 1min 35s
Wall time: 1min 37s


## Model Training - Word2Vec

Begin training model on corpus adding pretrained word2vec embeddings. These are trained on Google News articles.

In [None]:
#Checking coverage to see how overlap of embeddings with our vocab
#evaluate word2vec embeddings coverage
word2vec_coverage = check_matches(vocab, word2vec_embeddings)


Found embeddings for 50.584% of vocab
Found embeddings for  78.747% of all text


In [None]:

# init callback class to review our training.
class callback(CallbackAny2Vec):
    """
    Training hook that prints the loss at the end of every epoch.

    After the first epoch the value from ``get_latest_training_loss()`` is
    printed as-is; after later epochs the difference to the previous epoch's
    reading is printed instead.
    """
    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        current = model.get_latest_training_loss()
        first_epoch = self.epoch == 0
        shown = current if first_epoch else current - self.loss_previous_step
        print('Loss after epoch {}: {}'.format(self.epoch, shown))
        # Remember this reading so the next epoch can report its own share.
        self.loss_previous_step = current
        self.epoch += 1

In [None]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

In [None]:
# Instantiate the (still untrained) Word2Vec model; the vocabulary and the
# pretrained word2vec vectors are added in the following cells.
word2vec_model = Word2Vec(min_count=2,
                     window=5,
                     negative=5,
                     size=300,
                     # Leave one core free, but never request 0 worker threads
                     # (cores - 1 is 0 on a single-core machine).
                     workers=max(1, cores - 1)
                     )

In [None]:
word2vec_model.build_vocab(corpus)

In [None]:
word2vec_model.wv.vectors.shape
 

(18199, 300)

In [None]:
%%time
#Add word2vec embeddings for words that overlap in our vocab
word2vec_model.intersect_word2vec_format(WORD2VEC_EMBEDDINGS, lockf=1.0, binary=True)
# NOTE(review): this model trains for 8 epochs while the GloVe and FastText models below use 5,
# so comparisons between the three models are not like-for-like.
word2vec_model.train(corpus, 
                     total_examples=word2vec_model.corpus_count,
                     epochs = 8, 
                     report_delay=1,
                     compute_loss = True, # required so the callback can read get_latest_training_loss()
                     callbacks=[callback()])

Loss after epoch 0: 1196835.25
Loss after epoch 1: 992025.5
Loss after epoch 2: 918252.25
Loss after epoch 3: 877033.5
Loss after epoch 4: 780122.5
Loss after epoch 5: 735361.0
Loss after epoch 6: 720342.5
Loss after epoch 7: 710775.0
CPU times: user 1min 22s, sys: 782 ms, total: 1min 23s
Wall time: 1min 23s


In [None]:
#test similiarity of standard word, we can see that semantic similarities make sense.
word2vec_model.wv.most_similar(positive=["brother"])

[('sister', 0.6748436689376831),
 ('nephew', 0.656744122505188),
 ('odd', 0.6496289968490601),
 ('brothers', 0.6443120241165161),
 ('father', 0.6363226175308228),
 ('uncle', 0.6082543730735779),
 ('cousin', 0.6061365008354187),
 ('dean', 0.5971981287002563),
 ('siblings', 0.5868384838104248),
 ('stepfather', 0.5838647484779358)]

In [None]:
#Save the model for later use, disable model callbacks, required due to a bug as result of gensim
word2vec_model.callbacks = ()
word2vec_model.save("/content/drive/MyDrive/wattpad_test/root/models/word2vec/word2vec.model")


## Model Training - GLOVE

I will use the same process, but with GloVe embeddings instead. These are trained on Common Crawl. I decided it was worth exploring because the coverage of the GloVe embeddings is higher than that of the pretrained word2vec embeddings.

In [None]:
glove_coverage = check_matches(vocab, glove_embeddings)

Found embeddings for 55.187% of vocab
Found embeddings for  82.463% of all text


In [None]:
# Instantiate the (still untrained) Word2Vec model that is seeded with the
# pretrained GloVe vectors in the following cells.
glove_model = Word2Vec(min_count=2,
                     window=5,
                     negative=5,
                     size=300,
                     # Leave one core free, but never request 0 worker threads
                     # (cores - 1 is 0 on a single-core machine).
                     workers=max(1, cores - 1)
                     )

In [None]:
glove_model.build_vocab(corpus)

In [None]:
glove_model.wv.vectors.shape
 

(18199, 300)

In [None]:
%%time
glove_model.intersect_word2vec_format(GLOVE_EMBEDDINGS, lockf=1.0, binary=False)
glove_model.train(corpus, 
                     total_examples=glove_model.corpus_count,
                     epochs = 5, 
                     report_delay=1,
                     compute_loss = True, # set compute_loss = True
                     callbacks=[callback()])

Loss after epoch 0: 1138595.125
Loss after epoch 1: 958506.125
Loss after epoch 2: 880735.25
Loss after epoch 3: 845243.25
Loss after epoch 4: 778129.25
CPU times: user 1min 27s, sys: 453 ms, total: 1min 28s
Wall time: 1min 28s


In [None]:
#test similiarity of standard word, we can see that semantic similarities make sense.
print('Glove: similiarity test: \n', glove_model.wv.most_similar(positive=["happy"]))
print('Word2Vec: similiarity test \n',word2vec_model.wv.most_similar(positive=["happy"]))

Glove: similiarity test: 
 [('glad', 0.7198635935783386), ('wish', 0.6502501964569092), ('sad', 0.6355036497116089), ('wonderful', 0.6221669316291809), ('well', 0.6024701595306396), ('proud', 0.6011662483215332), ('know', 0.5920550227165222), ('obviously', 0.5870867967605591), ('hope', 0.5853500366210938), ('better', 0.5833157300949097)]
Word2Vec: similiarity test 
 [('glad', 0.7298558950424194), ('thankful', 0.6566928625106812), ('proud', 0.6545039415359497), ('sad', 0.6405991911888123), ('wonderful', 0.6126113533973694), ('wish', 0.6124885082244873), ('happier', 0.5944545269012451), ('grateful', 0.5809042453765869), ('better', 0.5791413187980652), ('confident', 0.5756374597549438)]


In [None]:
#Save the model for later use, disable model callbacks, required due to a bug as result of gensim
glove_model.callbacks = ()

glove_model.save("/content/drive/MyDrive/wattpad_test/root/models/glove/glove.model")


## Model Training - FastText

In [None]:
fast_embeddings = KeyedVectors.load_word2vec_format(FAST_EMBEDDINGS, binary=False)


In [None]:
%%time
fast_coverage = check_matches(vocab, fast_embeddings)

Found embeddings for 64.545% of vocab
Found embeddings for  88.891% of all text
CPU times: user 119 ms, sys: 994 µs, total: 120 ms
Wall time: 122 ms


In [None]:

# Instantiate the (still untrained) Word2Vec model that is seeded with the
# pretrained FastText vectors in the following cells.
fast_model = Word2Vec(min_count=2,
                     window=5,
                     negative=5,
                     size=300,
                     # Leave one core free, but never request 0 worker threads
                     # (cores - 1 is 0 on a single-core machine).
                     workers=max(1, cores - 1)
                     )

In [None]:
fast_model.build_vocab(corpus)

In [None]:
%%time
fast_model.intersect_word2vec_format(FAST_EMBEDDINGS, lockf=1.0, binary=False)
fast_model.train(corpus, 
                     total_examples=fast_model.corpus_count,
                     epochs = 5, 
                     report_delay=1,
                     compute_loss = True, # set compute_loss = True
                     callbacks=[callback()])

Loss after epoch 0: 1138986.5
Loss after epoch 1: 959151.5
Loss after epoch 2: 890129.0
Loss after epoch 3: 857594.25
Loss after epoch 4: 781818.25
CPU times: user 6min 42s, sys: 1.78 s, total: 6min 44s
Wall time: 6min 44s


In [None]:
fast_model.wv.most_similar(positive=["car"])

[('drive', 0.7881123423576355),
 ('driveway', 0.6953915357589722),
 ('truck', 0.6896342635154724),
 ('park', 0.6881539225578308),
 ('driver', 0.6609848737716675),
 ('road', 0.6435428261756897),
 ('house', 0.6325049996376038),
 ('vehicle', 0.6306074857711792),
 ('ride', 0.6245632171630859),
 ('cars', 0.6129789352416992)]

In [None]:
#Save the model for later use; callbacks are cleared first, the same gensim workaround used for the word2vec and glove models
fast_model.callbacks = ()
fast_model.save("/content/drive/MyDrive/wattpad_test/root/models/fasttext/fast.model")

# TF-IDF Vectorization
Here we use a TF-IDF weighting strategy with our word embeddings to obtain an overall document embedding.
A plain average of word embeddings weights every word equally, which does not necessarily suit groups of words, sentences, or documents. That unweighted alternative is the average embedding over the span of the text (the sum of all word vectors divided by the total number of words).

Kenter et al. 
https://www.aclweb.org/anthology/P16-1089.pdf

In [None]:
# Fit the TF-IDF model on the spaCy-cleaned documents.
tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    min_df=5,
    stop_words='english',
)
tfidf.fit(text_df['DocText_Cleaner'])

# Feature names (the TF-IDF terms / column names), fetched once and reused.
tfidf_feature = tfidf.get_feature_names()
# Lookup table: term -> idf weight.
tfidf_list = dict(zip(tfidf_feature, tfidf.idf_))


In [None]:
# Building TF-IDF based on embeddings 

def generate_vectors(corpus, model):
    """Build one TF-IDF-weighted average word vector per document.

    For every token occurrence that has both an embedding in ``model.wv`` and
    an idf weight in the notebook-level ``tfidf_list`` dict, the word vector
    is scaled by ``idf * (count of the word in the document / document
    length)`` and added to the document vector; the sum is then divided by
    the total weight. A word occurring k times is therefore added k times,
    exactly as before.

    Args:
        corpus: iterable of documents, each a list of string tokens.
        model: trained model exposing ``model.wv.vocab`` and ``model.wv[word]``.

    Returns:
        list of numpy arrays, one per document and in corpus order. A
        document with no usable token yields an all-zero vector.
    """
    # Take the dimensionality from the model; fall back to the previous fixed 300.
    dim = getattr(model.wv, 'vector_size', 300)
    tfidf_vectors = []

    for doc_text in corpus:
        text_vec = np.zeros(dim)
        weight_sum = 0

        # Count each word once per document instead of calling list.count()
        # for every token, which made the loop quadratic in document length.
        term_counts = collections.Counter(doc_text)
        doc_len = len(doc_text)

        for word in doc_text:
            # Membership is tested on the tfidf_list dict (same terms as the
            # tfidf_feature list) to avoid a linear scan per token.
            if word in model.wv.vocab and word in tfidf_list:
                tf_idf = tfidf_list[word] * (term_counts[word] / doc_len)
                text_vec += model.wv[word] * tf_idf
                weight_sum += tf_idf

        if weight_sum != 0:
            text_vec /= weight_sum
        tfidf_vectors.append(text_vec)

    return tfidf_vectors

In [None]:
#Method to get top N similiar documents, 

def similiarity_query(df, DocID, input_vectors, top_n=5):
    """Return the query document followed by its most similar documents.

    Args:
        df: DataFrame with 'DocID' and 'DocText' columns whose rows line up
            positionally with ``input_vectors``.
        DocID: id of the query document. If the id occurs more than once in
            ``df``, the first matching row is used.
        input_vectors: sequence of document vectors, one per row of ``df``.
        top_n: number of neighbours to return in addition to the query
            document itself (default 5, matching the previous fixed cut-off
            of 6 rows).

    Returns:
        DataFrame with the 'DocID' and 'DocText' columns of the ``top_n + 1``
        highest-scoring rows, ordered by descending cosine similarity. The
        ``(row position, score)`` pairs are also printed.

    Raises:
        KeyError: if ``DocID`` does not occur in ``df['DocID']``.
    """
    # Resolve the id to a row position. The old reverse index kept duplicated
    # ids, so looking one up returned a Series and broke the ranking.
    positions = np.flatnonzero((df['DocID'] == DocID).values)
    if positions.size == 0:
        raise KeyError(DocID)
    idx = int(positions[0])

    # Only the query row is needed, so skip the full N x N similarity matrix.
    query_vector = np.asarray(input_vectors[idx]).reshape(1, -1)
    similarities = cosine_similarity(query_vector, input_vectors)[0]

    scores = sorted(enumerate(similarities), key=lambda pair: pair[1], reverse=True)
    scores = scores[:top_n + 1]
    print(scores)

    doc_indices = [position for position, _ in scores]
    return df[['DocID', 'DocText']].iloc[doc_indices]


## Generating the Weighted Vectors

In [None]:
%%time
#tfidf using word2vec model
tfidf_w2v_vectors = generate_vectors(corpus, word2vec_model)

CPU times: user 6min 7s, sys: 3.07 s, total: 6min 10s
Wall time: 6min 7s


In [None]:
%%time
#tfidf using glove model
tfidf_glove_vectors = generate_vectors(corpus, glove_model)

CPU times: user 6min 7s, sys: 3.42 s, total: 6min 11s
Wall time: 6min 7s


In [None]:
%%time
#tfidf using fasttext model
tfidf_fast_vectors = generate_vectors(corpus, fast_model)

CPU times: user 6min 5s, sys: 2.75 s, total: 6min 8s
Wall time: 6min 5s


# Testing the Model and Similiarity results

In [None]:
word2vec_output = similiarity_query(text_df, 54461, tfidf_w2v_vectors)

[(3, 1.0), (102, 0.9559888092956998), (551, 0.9472381567610402), (632, 0.9454229846007264), (258, 0.9449445232904443), (584, 0.9416076677067808)]


In [None]:
glove_output = similiarity_query(text_df, 54461, tfidf_glove_vectors)

[(3, 1.0000000000000002), (102, 0.9620258730149605), (551, 0.9517544992492468), (632, 0.9451795644728527), (258, 0.9447027978867921), (584, 0.9442718597854415)]


In [None]:
fast_output = similiarity_query(text_df, 54461, tfidf_fast_vectors)

[(3, 0.9999999999999996), (102, 0.9661098062422976), (632, 0.9592715989655782), (551, 0.9583605415586101), (584, 0.9582858249963002), (258, 0.9568938523646581)]


In [None]:
pd.set_option('display.max_columns', None)

# Evaluation


Although training these models is an unsupervised task, I use the Spearman correlation coefficient on the WordSim-353 word-pair set (`wordsim353.tsv` from gensim's test data) to evaluate how well each model captures semantic similarity. This may not necessarily be the best indicator, since what counts as a successful evaluation depends on the specific use case.

In [None]:
#Evaluation of wordpairs

In [None]:
print('Word2vec model: ', word2vec_model.wv.evaluate_word_pairs(datapath("wordsim353.tsv")))
print('Glove model: ', glove_model.wv.evaluate_word_pairs(datapath("wordsim353.tsv")))
print('Fast model: ', fast_model.wv.evaluate_word_pairs(datapath("wordsim353.tsv")))

Word2vec model:  ((0.6303393065306551, 2.558761736650827e-28), SpearmanrResult(correlation=0.6523540703811658, pvalue=7.622006254689518e-31), 31.1614730878187)
Glove model:  ((0.6865899641258696, 3.2214578447819863e-35), SpearmanrResult(correlation=0.6894521597856433, pvalue=1.3028444500266424e-35), 31.1614730878187)
Fast model:  ((0.7211596298059225, 2.7078224312627366e-40), SpearmanrResult(correlation=0.7481017318057481, pvalue=8.166714588946754e-45), 31.1614730878187)


In [None]:
glove_model.wv.evaluate_word_pairs(datapath("wordsim353.tsv"))


((0.6865899641258696, 3.2214578447819863e-35),
 SpearmanrResult(correlation=0.6894521597856433, pvalue=1.3028444500266424e-35),
 31.1614730878187)

In [None]:
fast_model.wv.evaluate_word_pairs(datapath("wordsim353.tsv"))


((0.7211596298059225, 2.7078224312627366e-40),
 SpearmanrResult(correlation=0.7481017318057481, pvalue=8.166714588946754e-45),
 31.1614730878187)

# Review of Document Similiarities

In [None]:
word2vec_output

Unnamed: 0,DocID,DocText
3,54461,"I caught up with Lisa quickly and grabbed her, throwing her over my shoulder while she thrashed and screamed, laughing her ass off. Hannah was taunting me getting teasingly close so I snapped my hand out and clamped it around her wrist, making her squeal. I carried Lisa and dragged Hannah into the water and then threw Lisa in, laughing as she coughed and spluttered, wiping water out of her face. She grinned at me and grabbed my hand pulling me into the water too and before I knew what was happening the other two girls jumped on me too. I heard my mom shout back, still laughing. “You girls knew this would happen, you deal with the consequences!” I burst out laughing. Well I guess that’s what I get for falling asleep in front of three Peters girls, I should have known better! “You’d better run!” I shouted, streaking after one of them at random. It turned out to be Lisa so I smiled and pushed myself after her, making her scream at my mom for help, “Mom! Make him leave me alone!” The..."
102,59484,"I'm going to teach you some self defense. So get you ass ready right now so we can leave. I laughed and pulled on some shorts and a tank top. When we finally laughed I looked up at him. He actually wasn't that bad looking. He had dark hair and dark eyes. He was very handsome and strong to. I ruffled my hair again looking in the mirror. My blond hair was something else. I didn't have to do much in order to control it. But what can I do, I was gorgeous. ""Where shall we go to eat?"" I ignored him and he drove off. ""So tell me about yourself,"" Dimitri said as I took a gulp of water down. ""I'm hungry."" I cried. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ twenty Minutes later ""Kassidy turn the light off."" I pulled the covers over my head only to have them pulled away from. I opened my eyes in surprised and saw Dimitri. I sighed and stood up letting him get a good look. I knew I was in my bra and thong but it wasn't my fault he was so fucking evil to pu..."
551,51214,"Well I know alot about you I say back Because uh.. I love you guys I say shyly. Aww don't be shy love I think your pretty he says winking at me. I wake up and I'm still wrapped in his arms, warm and safe. She went to get the casting stuff and I'm sitting her staring at the wall. The door creeks and I shoot up and roll off the bed and hide behind it. I see a dust pan and grab it. The person walks in and I charge throwing the dust pan at the person and hitting them right in the face. I keep throwing stuff. Ow! Quinn stop! It's me Zayn! He yells. Zayn? I say questioning him. Yes, Zayn he says. I stop throwing stuff and fall to the floor and start crying. Here let me get it he says. He leans in and wipes it off. He keeps his hand on my cheek and stares into my eyes. He closes his eyes and leans in. OMG he's going to kiss me! I feel butterflies in my stomach I close my eyes and are lips met and my stomach does flips. I finished the bar and it felt good. Uh Lanae you have a little ..."
632,82059,"It's about Alize. ********* Joey's P. O.V I laughed as Alize said a very stupid joke. She smiled. I love that smile. She makes me feel all tingly inside. That's not weird right? ""Joey?"" Alize says looking straight into my eyes. My laughter died down. ""Yeah?"" I say, still looking into hazel brown eyes. ""I-"" someone knocked on the door. ""I'll go get it."" She said. I nod, upset that we were interrupted. A few minutes later I hear Alize and someone else. A guy. I don't know who though. Although he looks oddly familiar. ""Joey this is Tyler."" She said a little too happy. For some odd reason i got really angry and protective. ""Oh yeah I think i remember you."" I say casually. ""Wait Tyler how did you get my address?"" Alize asks, totally out of it. ""Oh, that's right, when you have it to me you were totally hammered."" Tyler says chuckling a little. Alize blushes. "" oh right."" I smiled ""Thank you, for understanding."" ""Don't mention it."" I sat down on a stool at our small ..."
258,69856,"Hey! Jake said, trying to pull my attention away from my phone, with his hand under my chin, i looked up into his swirly colored eyes. ""that mouth is too pretty for dirty words"" he leaned down winked and pecked my lips. My cheeks becoming rosy red. "" seriosly what's wrong?"" he continued, looking towards my phone. ""ooooh that's never good, come on, let's get you home."" he said, pulling me towards his motorcycle, pulling me onto his back. "" hey! what are you doing, I have legs you know?"" I protested against him picking me up. "" We can go faster this way."" he replied like I was a 5 year old who can't walk as fast. "" It's gonna take the same amount of time for me to walk with you anywa-"" ""Common"" he turned his head so I could see him, and jetted out his lip. "" FFINNEEE"" I dragged out not being able to say no to his puppy dog face. ""mmm you smell good"" I giggled, loving how close we were. "" That's what they all say."" he breathed out, sounding tired "" Do you want me to hop off, you soun..."
584,64338,"Okay. she smiled and I set her down. She looked back up at me. ""Tom will you play dollys with me when you're done?"" she asked. ""Can I aske what you're doing here? I thought I made it clear over the phone that our fling was over."" ""I know what you said but I wanted to tell you in person. I meant everything I said. I know that you don't want a relationship that's long distance but joules i'm serious. I can't lose you in my life Joules."" ""Tom. I don't know what to say."" she stuttered. ""Just say you'll be my friend. Say that you'll give me a chance."" ""I will if you swear on your Fathers grave that you did not cheat on me."" I know swearing on his grave is an awful thing to do. But I did not cheat on her. ""I swear."" I spoke. ""Alright. We can see how this goes. Now would you like to come in?"" She asked. ""I'd love to."" I smiled. ~~Michaels POV~~ As much as I was loving being grilled by Ryan nonstop about liking Joules I chose, instead to ditch him in California (not that he was upset. He w..."


In [None]:
glove_output

Unnamed: 0,DocID,DocText
3,54461,"I caught up with Lisa quickly and grabbed her, throwing her over my shoulder while she thrashed and screamed, laughing her ass off. Hannah was taunting me getting teasingly close so I snapped my hand out and clamped it around her wrist, making her squeal. I carried Lisa and dragged Hannah into the water and then threw Lisa in, laughing as she coughed and spluttered, wiping water out of her face. She grinned at me and grabbed my hand pulling me into the water too and before I knew what was happening the other two girls jumped on me too. I heard my mom shout back, still laughing. “You girls knew this would happen, you deal with the consequences!” I burst out laughing. Well I guess that’s what I get for falling asleep in front of three Peters girls, I should have known better! “You’d better run!” I shouted, streaking after one of them at random. It turned out to be Lisa so I smiled and pushed myself after her, making her scream at my mom for help, “Mom! Make him leave me alone!” The..."
102,59484,"I'm going to teach you some self defense. So get you ass ready right now so we can leave. I laughed and pulled on some shorts and a tank top. When we finally laughed I looked up at him. He actually wasn't that bad looking. He had dark hair and dark eyes. He was very handsome and strong to. I ruffled my hair again looking in the mirror. My blond hair was something else. I didn't have to do much in order to control it. But what can I do, I was gorgeous. ""Where shall we go to eat?"" I ignored him and he drove off. ""So tell me about yourself,"" Dimitri said as I took a gulp of water down. ""I'm hungry."" I cried. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ twenty Minutes later ""Kassidy turn the light off."" I pulled the covers over my head only to have them pulled away from. I opened my eyes in surprised and saw Dimitri. I sighed and stood up letting him get a good look. I knew I was in my bra and thong but it wasn't my fault he was so fucking evil to pu..."
551,51214,"Well I know alot about you I say back Because uh.. I love you guys I say shyly. Aww don't be shy love I think your pretty he says winking at me. I wake up and I'm still wrapped in his arms, warm and safe. She went to get the casting stuff and I'm sitting her staring at the wall. The door creeks and I shoot up and roll off the bed and hide behind it. I see a dust pan and grab it. The person walks in and I charge throwing the dust pan at the person and hitting them right in the face. I keep throwing stuff. Ow! Quinn stop! It's me Zayn! He yells. Zayn? I say questioning him. Yes, Zayn he says. I stop throwing stuff and fall to the floor and start crying. Here let me get it he says. He leans in and wipes it off. He keeps his hand on my cheek and stares into my eyes. He closes his eyes and leans in. OMG he's going to kiss me! I feel butterflies in my stomach I close my eyes and are lips met and my stomach does flips. I finished the bar and it felt good. Uh Lanae you have a little ..."
632,82059,"It's about Alize. ********* Joey's P. O.V I laughed as Alize said a very stupid joke. She smiled. I love that smile. She makes me feel all tingly inside. That's not weird right? ""Joey?"" Alize says looking straight into my eyes. My laughter died down. ""Yeah?"" I say, still looking into hazel brown eyes. ""I-"" someone knocked on the door. ""I'll go get it."" She said. I nod, upset that we were interrupted. A few minutes later I hear Alize and someone else. A guy. I don't know who though. Although he looks oddly familiar. ""Joey this is Tyler."" She said a little too happy. For some odd reason i got really angry and protective. ""Oh yeah I think i remember you."" I say casually. ""Wait Tyler how did you get my address?"" Alize asks, totally out of it. ""Oh, that's right, when you have it to me you were totally hammered."" Tyler says chuckling a little. Alize blushes. "" oh right."" I smiled ""Thank you, for understanding."" ""Don't mention it."" I sat down on a stool at our small ..."
258,69856,"Hey! Jake said, trying to pull my attention away from my phone, with his hand under my chin, i looked up into his swirly colored eyes. ""that mouth is too pretty for dirty words"" he leaned down winked and pecked my lips. My cheeks becoming rosy red. "" seriosly what's wrong?"" he continued, looking towards my phone. ""ooooh that's never good, come on, let's get you home."" he said, pulling me towards his motorcycle, pulling me onto his back. "" hey! what are you doing, I have legs you know?"" I protested against him picking me up. "" We can go faster this way."" he replied like I was a 5 year old who can't walk as fast. "" It's gonna take the same amount of time for me to walk with you anywa-"" ""Common"" he turned his head so I could see him, and jetted out his lip. "" FFINNEEE"" I dragged out not being able to say no to his puppy dog face. ""mmm you smell good"" I giggled, loving how close we were. "" That's what they all say."" he breathed out, sounding tired "" Do you want me to hop off, you soun..."
584,64338,"Okay. she smiled and I set her down. She looked back up at me. ""Tom will you play dollys with me when you're done?"" she asked. ""Can I aske what you're doing here? I thought I made it clear over the phone that our fling was over."" ""I know what you said but I wanted to tell you in person. I meant everything I said. I know that you don't want a relationship that's long distance but joules i'm serious. I can't lose you in my life Joules."" ""Tom. I don't know what to say."" she stuttered. ""Just say you'll be my friend. Say that you'll give me a chance."" ""I will if you swear on your Fathers grave that you did not cheat on me."" I know swearing on his grave is an awful thing to do. But I did not cheat on her. ""I swear."" I spoke. ""Alright. We can see how this goes. Now would you like to come in?"" She asked. ""I'd love to."" I smiled. ~~Michaels POV~~ As much as I was loving being grilled by Ryan nonstop about liking Joules I chose, instead to ditch him in California (not that he was upset. He w..."


In [None]:
# Render fast_output (created earlier in the notebook) as this cell's output.
fast_output

Unnamed: 0,DocID,DocText
3,54461,"I caught up with Lisa quickly and grabbed her, throwing her over my shoulder while she thrashed and screamed, laughing her ass off. Hannah was taunting me getting teasingly close so I snapped my hand out and clamped it around her wrist, making her squeal. I carried Lisa and dragged Hannah into the water and then threw Lisa in, laughing as she coughed and spluttered, wiping water out of her face. She grinned at me and grabbed my hand pulling me into the water too and before I knew what was happening the other two girls jumped on me too. I heard my mom shout back, still laughing. “You girls knew this would happen, you deal with the consequences!” I burst out laughing. Well I guess that’s what I get for falling asleep in front of three Peters girls, I should have known better! “You’d better run!” I shouted, streaking after one of them at random. It turned out to be Lisa so I smiled and pushed myself after her, making her scream at my mom for help, “Mom! Make him leave me alone!” The..."
102,59484,"I'm going to teach you some self defense. So get you ass ready right now so we can leave. I laughed and pulled on some shorts and a tank top. When we finally laughed I looked up at him. He actually wasn't that bad looking. He had dark hair and dark eyes. He was very handsome and strong to. I ruffled my hair again looking in the mirror. My blond hair was something else. I didn't have to do much in order to control it. But what can I do, I was gorgeous. ""Where shall we go to eat?"" I ignored him and he drove off. ""So tell me about yourself,"" Dimitri said as I took a gulp of water down. ""I'm hungry."" I cried. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ twenty Minutes later ""Kassidy turn the light off."" I pulled the covers over my head only to have them pulled away from. I opened my eyes in surprised and saw Dimitri. I sighed and stood up letting him get a good look. I knew I was in my bra and thong but it wasn't my fault he was so fucking evil to pu..."
632,82059,"It's about Alize. ********* Joey's P. O.V I laughed as Alize said a very stupid joke. She smiled. I love that smile. She makes me feel all tingly inside. That's not weird right? ""Joey?"" Alize says looking straight into my eyes. My laughter died down. ""Yeah?"" I say, still looking into hazel brown eyes. ""I-"" someone knocked on the door. ""I'll go get it."" She said. I nod, upset that we were interrupted. A few minutes later I hear Alize and someone else. A guy. I don't know who though. Although he looks oddly familiar. ""Joey this is Tyler."" She said a little too happy. For some odd reason i got really angry and protective. ""Oh yeah I think i remember you."" I say casually. ""Wait Tyler how did you get my address?"" Alize asks, totally out of it. ""Oh, that's right, when you have it to me you were totally hammered."" Tyler says chuckling a little. Alize blushes. "" oh right."" I smiled ""Thank you, for understanding."" ""Don't mention it."" I sat down on a stool at our small ..."
551,51214,"Well I know alot about you I say back Because uh.. I love you guys I say shyly. Aww don't be shy love I think your pretty he says winking at me. I wake up and I'm still wrapped in his arms, warm and safe. She went to get the casting stuff and I'm sitting her staring at the wall. The door creeks and I shoot up and roll off the bed and hide behind it. I see a dust pan and grab it. The person walks in and I charge throwing the dust pan at the person and hitting them right in the face. I keep throwing stuff. Ow! Quinn stop! It's me Zayn! He yells. Zayn? I say questioning him. Yes, Zayn he says. I stop throwing stuff and fall to the floor and start crying. Here let me get it he says. He leans in and wipes it off. He keeps his hand on my cheek and stares into my eyes. He closes his eyes and leans in. OMG he's going to kiss me! I feel butterflies in my stomach I close my eyes and are lips met and my stomach does flips. I finished the bar and it felt good. Uh Lanae you have a little ..."
584,64338,"Okay. she smiled and I set her down. She looked back up at me. ""Tom will you play dollys with me when you're done?"" she asked. ""Can I aske what you're doing here? I thought I made it clear over the phone that our fling was over."" ""I know what you said but I wanted to tell you in person. I meant everything I said. I know that you don't want a relationship that's long distance but joules i'm serious. I can't lose you in my life Joules."" ""Tom. I don't know what to say."" she stuttered. ""Just say you'll be my friend. Say that you'll give me a chance."" ""I will if you swear on your Fathers grave that you did not cheat on me."" I know swearing on his grave is an awful thing to do. But I did not cheat on her. ""I swear."" I spoke. ""Alright. We can see how this goes. Now would you like to come in?"" She asked. ""I'd love to."" I smiled. ~~Michaels POV~~ As much as I was loving being grilled by Ryan nonstop about liking Joules I chose, instead to ditch him in California (not that he was upset. He w..."
258,69856,"Hey! Jake said, trying to pull my attention away from my phone, with his hand under my chin, i looked up into his swirly colored eyes. ""that mouth is too pretty for dirty words"" he leaned down winked and pecked my lips. My cheeks becoming rosy red. "" seriosly what's wrong?"" he continued, looking towards my phone. ""ooooh that's never good, come on, let's get you home."" he said, pulling me towards his motorcycle, pulling me onto his back. "" hey! what are you doing, I have legs you know?"" I protested against him picking me up. "" We can go faster this way."" he replied like I was a 5 year old who can't walk as fast. "" It's gonna take the same amount of time for me to walk with you anywa-"" ""Common"" he turned his head so I could see him, and jetted out his lip. "" FFINNEEE"" I dragged out not being able to say no to his puppy dog face. ""mmm you smell good"" I giggled, loving how close we were. "" That's what they all say."" he breathed out, sounding tired "" Do you want me to hop off, you soun..."


# DOC2VEC

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize every sentence with gensim's ``simple_preprocess``.

    Each item is coerced to ``str`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.

    Args:
        sentences: Iterable of sentences; each element is converted with ``str``.

    Yields:
        The result of ``simple_preprocess`` for each sentence, in input order.
    """
    for raw_sentence in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
        yield tokens

In [None]:
# Preview the working frame; its DocID and DocText_Cleaner columns feed the Doc2Vec corpus below.
text_df

Unnamed: 0,DocID,DocText,ID,DocText_Clean,DocText_Cleaner
0,90041,I hear it all the time boo! Too bad I ain't go...,719f7906-f626-45f7-a0cf-f51c5f946e7d,hear time boo bad get patience anyone elses pr...,hear time boo bad patience else problem laugh ...
1,48027,"the dirt after the rain. soaking the hot, parc...",49ec299f-8951-4b96-bc73-b24149d8c89c,dirt rain soak hot parch earth curl bed warm f...,dirt rain soak hot parch earth curl bed warm f...
2,44081,The night before the wedding. A light tear tri...,df4ba250-71f8-4bf9-bbdb-205d6c01f6ff,night wed light tear trickle beautiful bold br...,night would light tear trickle beautiful bold ...
3,54461,"I caught up with Lisa quickly and grabbed her,...",28317d65-dca4-4966-8b48-2e762e3e25bf,catch lisa quickly grab throw shoulder thrash ...,catch lisa quickly grab throw shoulder thrash ...
4,37461,"And, he said as though he had not been interru...",b6874c4b-ef5e-46af-836e-a7285a675397,say though interrupt become friends dumbledore...,interrupt friend dumbledore good guy thank tha...
...,...,...,...,...,...
993,42620,"Uhmm... a day or two. I told them, my mom had ...",0170e266-8320-4629-a5fc-f373a767f0dd,uhmm day two tell mom make buy food honestly k...,uhmm day tell mom buy food honestly know hungr...
994,62987,Chapter 10The harbor was full of thousands of ...,67c509f5-e18d-41ad-b40d-9e24a61b2c44,chapter harbor full thousands people everyone ...,chapter harbor thousand people struggle push h...
995,85335,"Jason's POV""Hello.I'm Blackrose."" Said the lit...",28fa35af-193c-4539-84cc-699b00a284ac,jasons pov hello blackrose say little tiny imp...,jason pov hello blackrose little tiny imp dwar...
996,40867,xx Maggie Who loves 'Saved by the Bell'? I do!...,a95b9f5d-aae7-4361-92d8-d68f1c2893ca,maggie love save bell yes really short prologu...,maggie love save bell yes short prologue try p...


In [None]:
# Build a nested list of [DocID, DocText_Cleaner] pairs, one inner list per row of text_df.
doc_columns = ['DocID', 'DocText_Cleaner']
nested_docs = text_df.reset_index().loc[:, doc_columns].to_numpy().tolist()

In [None]:
# Wrap every [DocID, cleaned text] pair in a TaggedDocument: the whitespace-split
# text becomes the word list and the DocID becomes the document's single tag.
# NOTE(review): gensim treats plain-int tags as direct indexes into the doc-vector
# array, so large DocIDs may allocate far more vectors than documents - confirm.
tagged_docs = [
  TaggedDocument(words=doc_text.split(), tags=[doc_id])
  for doc_id, doc_text in nested_docs
]

In [None]:
%%time
#Setting up the Doc2vec model and training parameters
doc2vec_model = Doc2Vec(tagged_docs, 
                        vector_size=300, 
                        window=1, 
                        min_count=10, 
                        workers=4, 
                        dm=1, 
                        sample= 0.01, 
                        dm_concat = 1, 
                        negative = 5, 
                        dbow_words = 1, 
                        alpha=0.025, 
                        min_alpha=0.02)

CPU times: user 28.6 s, sys: 147 ms, total: 28.7 s
Wall time: 23.4 s


In [None]:
# Persist the trained model to Google Drive so it can be reloaded without retraining.
doc2vec_model_path = '/content/drive/MyDrive/wattpad_test/root/models/doc2vec/doc2vec.model'
doc2vec_model.save(doc2vec_model_path)

In [None]:
# Sanity check on the word vectors: list the ten words closest to 'hello'.
doc2vec_model.wv.most_similar(positive=['hello'], topn=10)

[('hey', 0.6708171367645264),
 ('holly', 0.6362658739089966),
 ('beti', 0.6282057762145996),
 ('dryly', 0.6237810850143433),
 ('mrs', 0.6122820377349854),
 ('bye', 0.6112575531005859),
 ('gezel', 0.6052325963973999),
 ('drake', 0.599120557308197),
 ('zack', 0.596785306930542),
 ('jennifer', 0.5955810546875)]

## Similarity Query



In [None]:
def doc_tokens(DocID, df):
  """Return the cleaned tokens stored for one document.

  Args:
    DocID: Value to match against the ``DocID`` column of ``df``.
    df: DataFrame with ``DocID`` and ``DocText_Cleaner`` columns.

  Returns:
    The ``DocText_Cleaner`` string of the matching row, split on whitespace.

  Raises:
    ValueError: Raised by ``Series.item()`` unless exactly one row matches.
  """
  # Select the cleaned text of the row(s) whose DocID equals the requested one.
  matching_text = df.loc[df['DocID'] == DocID, 'DocText_Cleaner']
  query_tokens = matching_text.item().split()
  print(query_tokens)
  return query_tokens

In [None]:
def similiarity_query(query, model, topn=6):
  """Rank the model's trained documents by similarity to a tokenized query.

  Args:
    query: List of tokens describing the query document.
    model: Trained Doc2Vec-style model exposing ``infer_vector`` and
      ``docvecs.most_similar``.
    topn: Number of results to request. Defaults to 6, the value that was
      previously hard-coded (presumably so five neighbours remain once the
      query document itself is discounted).

  Returns:
    Whatever ``model.docvecs.most_similar`` returns for the inferred vector;
    in this notebook, a list of ``(tag, similarity)`` tuples.
  """
  # Infer a vector for the query tokens, then look up its nearest document vectors.
  query_vector = model.infer_vector(query)
  return model.docvecs.most_similar([query_vector], topn=topn)

In [None]:
# Use document 54461 as the example query and fetch its cleaned tokens.
query_doc_id = 54461
query = doc_tokens(query_doc_id, text_df)



In [None]:
# Rank the trained documents against the query tokens and show the result.
results = similiarity_query(query=query, model=doc2vec_model)
results

[(54461, 0.7882322669029236),
 (44831, 0.49452969431877136),
 (32602, 0.4581599533557892),
 (17603, 0.45739981532096863),
 (6384, 0.45655691623687744),
 (36038, 0.4527413249015808)]

In [None]:
%unload_ext google.colab.data_table
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 5000)

x = pd.DataFrame(text_df['DocText'].loc[text_df.DocID.isin(['54461','44831','32602','17603','6384','36038'])])
x

The google.colab.data_table extension is not loaded.


Unnamed: 0,DocText
3,"I caught up with Lisa quickly and grabbed her, throwing her over my shoulder while she thrashed and screamed, laughing her ass off. Hannah was taunting me getting teasingly close so I snapped my hand out and clamped it around her wrist, making her squeal. I carried Lisa and dragged Hannah into the water and then threw Lisa in, laughing as she coughed and spluttered, wiping water out of her face. She grinned at me and grabbed my hand pulling me into the water too and before I knew what was happening the other two girls jumped on me too. I heard my mom shout back, still laughing. “You girls knew this would happen, you deal with the consequences!” I burst out laughing. Well I guess that’s what I get for falling asleep in front of three Peters girls, I should have known better! “You’d better run!” I shouted, streaking after one of them at random. It turned out to be Lisa so I smiled and pushed myself after her, making her scream at my mom for help, “Mom! Make him leave me alone!” The..."
562,"“You know most girls would be mad that they were thrown into a muddy lake.” Kyle said when I came up. “No I'm not.” I protested . “One day I will make you realize that you are better than you think, but until then will you have dinner with me and my family?” He said shaking his head. “Of course Big Bad Alpha.” I jokingly answered. “I'll show you big and bad. Hold your breath.” Ky sad picking me up and throwing me across the lake. I came up laughing . “Well if you two love birds are done I'm starving.” I hear from the left of me. I didn't even have to look to know it was Miles. I looked over Kyle's shoulder to see Miles standing there uninjured beside his arm in a sling. Miles must have saw me looking at it and said, “Don't worry it will be healed by the morning,” Miles always knew how to reassure me. “Lets go eat dinner.” Kyle said letting me jump on his back. As we got out of the lake Miles handed us a towel and we walked back inside to eat dinner. Sitting at the table wa..."
690,"Right? I said. ""Well, we were together this weekend,"" she smiled,""AndyouknowIreallylikehim!"" she said so fast that her words were all jumbled.. I rolled my eyes and ignored what she just said. He used her the first time, what makes her think he is going to change this time? I really hope she knows what she is doing. ""Say something!"" she frowned. ""Something!"" I said flatly. I shouldn't be mad at her, it is her choice and she can do whatever she wants to do. ""Seriously,"" she sighed,""You are my best friend and you should be happy with whatever decision I make!"" she pouted. That face doesn't work anymore, but she is right. ""Fine, you're right!"" I sighed,""Just be careful!"" I warned. She has no idea what she is getting herself into. Speaking of the devil, Kyle and his buddies were heading our way. ~ ""Why are you always so early to class?"" I heard from behind me. I smiled at the sound of his voice. Ethan pushed the door shut, stopping me from entering. ""The only way you would know that is..."
729,"Any guys? he asked, this time looking clearly amused. ""Nothing really...there's some good book stores around I like to go to"" I say, not glancing up once to see the smug look on his face. I continued to move around my food, mixing the scrambled eggs with the ketchup. ""I'm done"" he stated to both of us and slammed the door shut, leaving Gwen and I all alone. ""I'm going out"" I said standing from the kitchen table. I grabbed my bag from the coat rack. ""So what have you been up to these past 7 months?"" he questioned pointing his spoon in my direction. ""Must be!"" my sister said scooping her scrambled eggs in a heap and bringing it to her mouth. ""It felt pretty amazing, you know there was a rush of adrenaline but the feeling of knowing you might not come back alive is...a bit scary."" ""Your dying one minute at a time"" continued to play through my head during the entire night and throughout breakfast. He talked about his experience in Afghanistan for what seemed like hours but his previous..."
831,"HEY GUYS!!! Ok, this is my first book and I'm really just experimenting.. I don't know how often I will add more and where this book is going, I'm just going to type until I've finished it. it would be great if i could get feed back from you all! ;) hope you enjoy it! Chapter One- Goodbye AustraliaI flew my eyes open, allowing them to search around my once quiet room. 'BANG!' There it was again. My eyes darted left to right until they fell upon my door where I saw two paws shadowing at the crack of the bottom of it. I felt relief flood into me as I realised it was only TP. I rolled over and saw the sun streaming through the cracks of my dust covered blinds, it looked so warm like a perfect summers day, although I knew that was too good to be true. I then looked down and stared at my favourite purple suitcase that lay there unzipped on my messy floor, with all it's contents spilling out. ""Today's the day."" I thought out loud. I was going to boarding school. I have been dreading this..."
878,"Breakfast in bed, ma announced the obvious. ""Oh, we do spoil you."" Ma pushed open the door with a tray, singing happy birthday. I knew dad was getting my present from downstairs. Lindsay had come home from college specially to celebrate my birthday. Nate was the only one not home. I had planned our reunion for god knows how long, and he had gone and spoilt it. I had a feeling they were all keeping something from me. But I didn't say any more. They, like Nate, had just been back from the one-year-war, and they needed some rest. With their girlfriends. The war had ended - a short but cruel war. My boyfriend, Nate, was meant to be back from the army a week ago. I'd been questioning his friends about the delay, but they said it was just 'army traffic'. Except I couldn't relax. Se-ven-teen. I tasted the word on my tongue. It was the age between two milestones - sixteen and eighteen. Wedging itself in between the two most important ages. I was glad it was there because,..."


## Evaluation

In [None]:
# Self-similarity check: re-infer a vector for every training document and record
# at which position the document's own tag appears among all trained doc vectors.
ranks = []
second_ranks = []
for tagged_doc in tagged_docs:
    # The documents are tagged with their DocID, not with their position in
    # tagged_docs, so the rank has to be looked up by tag. Searching for the
    # positional index (as before) finds an unrelated vector.
    own_tag = tagged_doc.tags[0]
    inferred_vector = doc2vec_model.infer_vector(tagged_doc.words)
    sims = doc2vec_model.docvecs.most_similar([inferred_vector], topn=len(doc2vec_model.docvecs))
    rank = [tag for tag, _ in sims].index(own_tag)
    ranks.append(rank)

    # Keep the runner-up (tag, similarity) pair for later inspection.
    second_ranks.append(sims[1])

In [None]:
# Tally how often each rank value occurs across the training corpus.
counter = collections.Counter()
counter.update(ranks)
print(counter)
counter.most_common()

# 9/997 have very close document similarities

[(70654, 2),
 (80884, 2),
 (55690, 2),
 (1468, 2),
 (68366, 2),
 (3166, 1),
 (33751, 1),
 (83506, 1),
 (77659, 1),
 (81386, 1),
 (62865, 1),
 (71331, 1),
 (25530, 1),
 (44285, 1),
 (84167, 1),
 (49428, 1),
 (74315, 1),
 (68493, 1),
 (75641, 1),
 (42277, 1),
 (76755, 1),
 (63875, 1),
 (43617, 1),
 (35963, 1),
 (32134, 1),
 (65159, 1),
 (13118, 1),
 (34808, 1),
 (63876, 1),
 (4893, 1),
 (13994, 1),
 (42331, 1),
 (39165, 1),
 (51601, 1),
 (51034, 1),
 (24444, 1),
 (56390, 1),
 (14123, 1),
 (98258, 1),
 (73052, 1),
 (2780, 1),
 (57353, 1),
 (56185, 1),
 (81544, 1),
 (68105, 1),
 (50074, 1),
 (16480, 1),
 (20652, 1),
 (88965, 1),
 (10130, 1),
 (50418, 1),
 (68989, 1),
 (96513, 1),
 (44994, 1),
 (5153, 1),
 (30096, 1),
 (53585, 1),
 (76398, 1),
 (79473, 1),
 (43076, 1),
 (47048, 1),
 (34963, 1),
 (25692, 1),
 (21777, 1),
 (85426, 1),
 (15939, 1),
 (29234, 1),
 (94652, 1),
 (65075, 1),
 (30563, 1),
 (43320, 1),
 (80204, 1),
 (16327, 1),
 (74502, 1),
 (74290, 1),
 (25286, 1),
 (91925, 1),
 (35