In [1]:
# coding: utf-8

import os, glob, re, io, random, gensim, smart_open, logging, collections
import numpy as np
import pandas as pd

from PyPDF2 import PdfReader
from pdfminer.high_level import extract_text as fallback_text_extraction
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
from nltk.tokenize import word_tokenize

pdfReaders = []
pdfFiles = []
docLabels = []

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
rootDir = "/Users/tillman/t-root/dev/projects/2022/pdf-correlator/gitignored"
txtExtractDir = rootDir + '/txt-extractions'
modelDataDir = rootDir + '/model-data'
zoteroDir = '/Users/tillman/t-root/zotero/storage'


class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

In [2]:
# read files
print("reading pdfs in" + str(rootDir) + " (including subdirectories)")
def read_files():
    os.chdir(rootDir)
    for file in glob.glob("**/*.pdf"):
        try:
            pdfFiles.append(file)
            pdfReaders.append(PdfReader(file))
        except:
            print(bcolors.FAIL + "error: " + file + " is unreadable by glob.glob. Skipping file" + bcolors.ENDC)
    print(bcolors.OKGREEN + "pdf files read" + bcolors.ENDC)
    print()
read_files()

reading pdfs in/Users/tillman/t-root/dev/projects/2022/pdf-correlator/gitignored (including subdirectories)
[91merror: pdf-tests/Simeone et al_2018_Arts and design as translational mechanisms for academic entrepreneurship.pdf is unreadable by glob.glob. Skipping file[0m
[92mpdf files read[0m



In [3]:
# extract text from pdfs to designated directory and save as txt files.
def extract_to_txt():
    print("Extracting pdfs to text files (duplicate pdfs are handled automagically)")
    os.chdir(txtExtractDir)
    counter = 0
    text = ""
    for i in pdfReaders:
        counter += 1
        with open(str([i.metadata.title]) + ".txt", 'w', encoding="utf-8") as file:
      
            # add doc title to array for reference / tagging
            docLabels.append(i.metadata.title)
            try:
                for j in range(len(i.pages)):
                    # format txt file so that each document is one one line (doc2vec requirement)
                    text += i.getPage(j).extract_text()
                    text = "".join(line.strip("\n") for line in text)  

                
                    
            except Exception as exc:
                print(bcolors.FAIL + "error: " + "\"" + str(i.metadata.title) + "\"" + " not extractable with PyPDF2, trying with pdfminer" + bcolors.ENDC)
                print()
                # format txt file so that each document is one one line (doc2vec requirement)
                text += fallback_text_extraction(rootDir + "/" + pdfFiles[counter])
                text = "".join(line.strip("\n") for line in text)     
                
 
            file.write(text)
    print(bcolors.OKGREEN + "pdf extraction complete" + bcolors.ENDC)
extract_to_txt()


Extracting pdfs to text files (duplicate pdfs are handled automagically)
[91merror: "None" not extractable with PyPDF2, trying with pdfminer[0m

[92mpdf extraction complete[0m


In [4]:
# generate a training corpus from all txt files found in designated directory
class CorpusGen(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self, tokens_only=False):
        counter = 0
        for fname in os.listdir(self.dirname):
            
            with smart_open.open(fname, encoding="iso-8859-1") as f:
                for i, line in enumerate(f):
                    tokens = gensim.utils.simple_preprocess(line, min_len=3, max_len=20, deacc=True)
                    if tokens_only:
                        yield tokens
                    else:
                        yield gensim.models.doc2vec.TaggedDocument(tokens, [counter])
            counter += 1
        
trainCorpus = list(CorpusGen('/Users/tillman/t-root/dev/projects/2022/pdf-correlator/gitignored/txt-extractions'))

In [5]:
# establish a model and build the vocab
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(trainCorpus)
model.train(trainCorpus, total_examples=model.corpus_count, epochs=model.epochs)

2022-09-28 17:34:40,353 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d50,n5,w5,mc2,s0.001,t3>', 'datetime': '2022-09-28T17:34:40.353871', 'gensim': '4.2.0', 'python': '3.10.6 | packaged by conda-forge | (main, Aug 22 2022, 20:41:22) [Clang 13.0.1 ]', 'platform': 'macOS-12.0.1-arm64-arm-64bit', 'event': 'created'}
2022-09-28 17:34:40,354 : INFO : collecting all words and their counts
2022-09-28 17:34:40,354 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2022-09-28 17:34:40,408 : INFO : collected 13122 word types and 12 unique tags from a corpus of 12 examples and 614530 words
2022-09-28 17:34:40,409 : INFO : Creating a fresh vocabulary
2022-09-28 17:34:40,429 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=2 retains 13119 unique words (99.98% of original 13122, drops 3)', 'datetime': '2022-09-28T17:34:40.429008', 'gensim': '4.2.0', 'python': '3.10.6 | packaged by conda-forge | (main, Aug 22 2022, 20:41:22) [Clang 13.0.1 ]

In [11]:
# generate and format data files for tensorboard visualisation
os.chdir(modelDataDir)
model.save_word2vec_format('doc_tensor.w2v', doctag_vec=True, word_vec=False)
%run ../scripts/word2vec2tensor.py -i doc_tensor.w2v -o pdf_plot

text = ""    
with open('pdf_plot_metadata.tsv', 'w') as file:
    file.write('title\n')
    for fname in os.listdir(txtExtractDir):
        if fname.endswith('.txt'):
            text = fname
            text = re.sub('\[\'', '', text)
            text = re.sub('\'\].txt', '', text)
            text = re.sub('\[', '', text)
            text = re.sub('\].txt', '', text)     
            print(text)
            file.write("%s\n" % text)
        else:
            continue
        

2022-09-28 17:36:04,211 : INFO : storing 12x50 projection weights into doc_tensor.w2v
2022-09-28 17:36:04,220 : INFO : running ../scripts/word2vec2tensor.py -i doc_tensor.w2v -o pdf_plot
2022-09-28 17:36:04,221 : INFO : loading projection weights from doc_tensor.w2v
2022-09-28 17:36:04,223 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (12, 50) matrix of type float32 from doc_tensor.w2v', 'binary': False, 'encoding': 'utf8', 'datetime': '2022-09-28T17:36:04.223864', 'gensim': '4.2.0', 'python': '3.10.6 | packaged by conda-forge | (main, Aug 22 2022, 20:41:22) [Clang 13.0.1 ]', 'platform': 'macOS-12.0.1-arm64-arm-64bit', 'event': 'load_word2vec_format'}
2022-09-28 17:36:04,225 : INFO : 2D tensor file saved to pdf_plot_tensor.tsv
2022-09-28 17:36:04,226 : INFO : Tensor metadata file saved to pdf_plot_metadata.tsv
2022-09-28 17:36:04,226 : INFO : finished running word2vec2tensor.py


IAFOR Journal of Cultural Studies – Volume 6 – Issue 1 
frvir-2022-779148 1..5
ShareVR: Enabling Co-Located Experiences for Virtual Reality between HMD and Non-HMD Users
The effects of visual context and individual differences on perception and evaluation of modern art and graffiti art
User attention and behaviour in virtual reality art encounter
Microsoft Word - 48710116.DOC
pone.0099019 1..8
Making Art Therapy Virtual: Integrating Virtual Reality Into Art Therapy With Adolescents
Microsoft Word - CHI2018_LucidDreaming_v5.docx
g5grap.lo
None


In [15]:
# word occurence check
checkWord = "internet"
print("\"" + str(checkWord) + "\"" + " appears this many times in corpus:")
print({model.wv.get_vecattr(checkWord, 'count')})
print()

"internet" appears this many times in corpus:
{102}



In [8]:
# infer a vector from corupus (I dont actually know (yet) what this means or does! :D )
print("infering a default vector")
print()
vector = model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])

infering a default vector



In [9]:
# save the entire corpus to a txt file
with open(modelDataDir + "/" + "train-corpus.txt", 'w') as file:
    file.write(str(trainCorpus))

In [12]:
# assessing the model
print("assessing the model (this can take a while)")
ranks = []
secondRanks = []
for doc_id in range(len(trainCorpus)):
        inferredVector = model.infer_vector(trainCorpus[doc_id].words)
        sims = model.dv.most_similar([inferredVector], topn=len(model.dv))
        rank = [docid for docid, sim in sims].index(doc_id)
        ranks.append(rank)
        secondRanks.append(sims[1])

counter = collections.Counter(ranks)
print(bcolors.OKGREEN + "model assessed, all is well in computer land" + bcolors.ENDC)

assessing the model (this can take a while)
[92mmodel assessed[0m

Counter({0: 3, 3: 3, 8: 1, 7: 1, 9: 1, 5: 1, 6: 1, 1: 1})
