In [1]:
# coding: utf-8

import os, glob, re, io, random, gensim, smart_open, logging, collections
import numpy as np

from PyPDF2 import PdfReader
from pdfminer.high_level import extract_text as fallback_text_extraction
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
from nltk.tokenize import word_tokenize

pdfReaders = []
pdfFiles = []
docLabels = []

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
rootDir = "/Users/tillman/t-root/dev/projects/2022/pdf-correlator/gitignored"
txtExtractDir = "/Users/tillman/t-root/dev/projects/2022/pdf-correlator/gitignored/txt-extractions/"
zoteroDir = '/Users/tillman/t-root/zotero/storage'

class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

In [2]:
# read files
print("reading pdfs in" + str(rootDir) + " (including subdirectories)")
def read_files():
    os.chdir(rootDir)
    for file in glob.glob("**/*.pdf"):
        try:
            pdfFiles.append(file)
            pdfReaders.append(PdfReader(file))
        except:
            print(bcolors.FAIL + "error: " + file + " is unreadable by glob.glob. Skipping file" + bcolors.ENDC)
    print(bcolors.OKGREEN + "pdf files read" + bcolors.ENDC)
    print()
read_files()

reading pdfs in/Users/tillman/t-root/dev/projects/2022/pdf-correlator/gitignored (including subdirectories)
[91merror: pdf-tests/Simeone et al_2018_Arts and design as translational mechanisms for academic entrepreneurship.pdf is unreadable by glob.glob. Skipping file[0m
[92mpdf files read[0m



In [28]:
# extract text from pdfs to designated directory and save as txt files.
def extract_to_txt():
    os.chdir(txtExtractDir)
    pat0 = ('(?<!Dr)(?<!Esq)\. +(?=[A-Z])')
    pat1 = ('\.+(?=[A-Z])')
    pat2 = ('\.+(?=[0-9])')
    pat3 = ('\. +(?=[0-9])')
    pat4 = ('(?=[for, a, of, the, and, to, in])')

    patterns = [pat0, pat1, pat2, pat3, pat4]
    counter = 0
    text = ""
    for i in pdfReaders:
        counter += 1
        with open(str([i.metadata.title]) + ".txt", 'w', encoding="utf-8") as file:

            # add doc title to array for reference / tagging
            docLabels.append(i.metadata.title)
            print("excracting: " + str(i.metadata.title))
            try:
                for j in range(len(i.pages)):
                    # format txt file so that each document is one one line (doc2vec requirement)
                    text += i.getPage(j).extract_text()
                    text = "".join(line.strip("\n") for line in text)  


            except Exception as exc:
                print(bcolors.FAIL + "error: " +  "pdf not extractable with PyPDF2, trying with pdfminer" + bcolors.ENDC)
                
                # format txt file so that each document is one one line (doc2vec requirement)
                text += fallback_text_extraction(rootDir + "/" + pdfFiles[counter])
                text = "".join(line.strip("\n") for line in text) 
                
            file.write(text)
            print()


extract_to_txt()


excracting: pone.0099019 1..8

excracting: None

excracting: User attention and behaviour in virtual reality art encounter

excracting: Microsoft Word - CHI2018_LucidDreaming_v5.docx

excracting: Microsoft Word - 48710116.DOC

excracting: frvir-2022-779148 1..5

excracting: Making Art Therapy Virtual: Integrating Virtual Reality Into Art Therapy With Adolescents

excracting: None

excracting: Making Art Therapy Virtual: Integrating Virtual Reality Into Art Therapy With Adolescents

excracting: ShareVR: Enabling Co-Located Experiences for Virtual Reality between HMD and Non-HMD Users

excracting: None

[91merror: pdf not extractable with PyPDF2, trying with pdfminer[0m

excracting: The effects of visual context and individual differences on perception and evaluation of modern art and graffiti art

excracting: IAFOR Journal of Cultural Studies – Volume 6 – Issue 1 

excracting: g5grap.lo

excracting: User attention and behaviour in virtual reality art encounter

excracting: ShareVR: En

In [49]:
# generate a training corpus from all txt files found in designated directory
class CorpusGen(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self, tokens_only=False):
        counter = 0
        for fname in os.listdir(self.dirname):
            counter += 1
            with smart_open.open(fname, encoding="iso-8859-1") as f:
                for i, line in enumerate(f):
                    tokens = gensim.utils.simple_preprocess(line, min_len=3, max_len=15, deacc=True)
                    if tokens_only:
                        yield tokens
                    else:
                        yield gensim.models.doc2vec.TaggedDocument(tokens, [counter])

trainCorpus = list(CorpusGen('/Users/tillman/t-root/dev/projects/2022/pdf-correlator/gitignored/txt-extractions'))

In [50]:
# establish a model and build the vocab
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=1, epochs=40)
model.build_vocab(trainCorpus)

2022-09-28 14:27:52,193 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d50,n5,w5,s0.001,t3>', 'datetime': '2022-09-28T14:27:52.193847', 'gensim': '4.2.0', 'python': '3.10.6 | packaged by conda-forge | (main, Aug 22 2022, 20:41:22) [Clang 13.0.1 ]', 'platform': 'macOS-12.0.1-arm64-arm-64bit', 'event': 'created'}
2022-09-28 14:27:52,195 : INFO : collecting all words and their counts
2022-09-28 14:27:52,197 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2022-09-28 14:27:52,328 : INFO : collected 12549 word types and 14 unique tags from a corpus of 13 examples and 1218080 words
2022-09-28 14:27:52,328 : INFO : Creating a fresh vocabulary
2022-09-28 14:27:52,348 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=1 retains 12549 unique words (100.00% of original 12549, drops 0)', 'datetime': '2022-09-28T14:27:52.348348', 'gensim': '4.2.0', 'python': '3.10.6 | packaged by conda-forge | (main, Aug 22 2022, 20:41:22) [Clang 13.0.1 ]',

In [51]:
# word occurence check
checkWord = "the"
print("\"" + str(checkWord) + "\"" + " appears this many times in corpus:")
print({model.wv.get_vecattr(checkWord, 'count')})
print()
model.train(trainCorpus, total_examples=model.corpus_count, epochs=model.epochs)


2022-09-28 14:27:57,127 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 12549 vocabulary and 50 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2022-09-28T14:27:57.127926', 'gensim': '4.2.0', 'python': '3.10.6 | packaged by conda-forge | (main, Aug 22 2022, 20:41:22) [Clang 13.0.1 ]', 'platform': 'macOS-12.0.1-arm64-arm-64bit', 'event': 'train'}
2022-09-28 14:27:57,205 : INFO : EPOCH 0: training on 1218080 raw words (114501 effective words) took 0.1s, 1576556 effective words/s
2022-09-28 14:27:57,258 : INFO : EPOCH 1: training on 1218080 raw words (114458 effective words) took 0.1s, 2174091 effective words/s
2022-09-28 14:27:57,311 : INFO : EPOCH 2: training on 1218080 raw words (114478 effective words) took 0.1s, 2217330 effective words/s


"the" appears this many times in corpus:
{81296}



2022-09-28 14:27:57,367 : INFO : EPOCH 3: training on 1218080 raw words (114461 effective words) took 0.1s, 2072402 effective words/s
2022-09-28 14:27:57,422 : INFO : EPOCH 4: training on 1218080 raw words (114457 effective words) took 0.1s, 2152631 effective words/s
2022-09-28 14:27:57,477 : INFO : EPOCH 5: training on 1218080 raw words (114469 effective words) took 0.1s, 2181671 effective words/s
2022-09-28 14:27:57,529 : INFO : EPOCH 6: training on 1218080 raw words (114444 effective words) took 0.1s, 2226625 effective words/s
2022-09-28 14:27:57,583 : INFO : EPOCH 7: training on 1218080 raw words (114476 effective words) took 0.1s, 2200372 effective words/s
2022-09-28 14:27:57,633 : INFO : EPOCH 8: training on 1218080 raw words (114454 effective words) took 0.0s, 2390516 effective words/s
2022-09-28 14:27:57,685 : INFO : EPOCH 9: training on 1218080 raw words (114465 effective words) took 0.1s, 2217585 effective words/s
2022-09-28 14:27:57,735 : INFO : EPOCH 10: training on 1218080

In [39]:
# infer a vector from corupus (I dont actually know (yet) what this means or does! :D )
print("infering a default vector")
print()
vector = model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])

infering a default vector



In [40]:
# save the entire corpus to a txt file
with open(txtExtractDir + "traincorpus.txt", 'w') as file:
    file.write(str(trainCorpus))

In [42]:
# assessing the model
print("assessing the model (this can take a while)")
print()
ranks = []
secondRanks = []
for doc_id in range(len(trainCorpus)):
        inferredVector = model.infer_vector(trainCorpus[doc_id].words)
        sims = model.dv.most_similar([inferredVector], topn=len(model.dv))
        rank = [docid for docid, sim in sims].index(doc_id)
        ranks.append(rank)
        secondRanks.append(sims[1])

counter = collections.Counter(ranks)


assessing the model (this can take a while)



In [43]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(trainCorpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(trainCorpus[sims[index][0]].words)))

print()
print(bcolors.OKGREEN + "doc2vec training and assessment successful" + bcolors.ENDC)


SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dm/m,d50,n5,w5,s0.001,t3>:

MOST (7, 0.9997449517250061): «art time and space context modulates the relationbetween art experience and viewing timedavid brieber marcos nadal helmut leder raphael rosenberg university vienna department basic psychological research and research methods faculty psychology vienna austria university vienna department art history faculty historical and cultural studies vienna experience art emerges from the interaction various cognitive and affective processes the unfolding theseprocesses time and their relation with viewing behavior however still poorly understood here examined the effectof context the relation between the experience art and viewing time the most basic indicator viewing behavior two groups participants viewed art exhibition one two contexts one the museum the other the laboratory both cases viewing time was recorded with mobile eye tracking system after freely viewing the exhibition each artwork lik

IndexError: list index out of range