In [1]:
! pip install pymupdf

Defaulting to user installation because normal site-packages is not writeable


In [2]:
! pip install iso-639

Defaulting to user installation because normal site-packages is not writeable


In [3]:
import fitz
from iso639 import languages

In [4]:
import pandas as pd
import numpy as np
import pprint
import PyPDF2
import pdfminer
import pymc3 as pm
import os
import glob
import random
from langdetect import detect
from langdetect import detect_langs

In [5]:
def randomFiles(location):
    randomFilesDF = pd.DataFrame()
    for i in location:
        pdffiles = sorted(glob.glob(i + '*.pdf'))
        pdffiles = pd.DataFrame({'filePath' : pdffiles})
        result = pdffiles.sample(40,  random_state = np.random.RandomState(10))
        randomFilesDF = pd.concat([randomFilesDF, result])
    return randomFilesDF

In [6]:
randomFilesDF = randomFiles(["/scratch/yte9pc/InternetArchive/Datasets/fatcat_longtail_lang/", 
             "/scratch/yte9pc/InternetArchive/Datasets/fatcat_pdf/",
             "/scratch/yte9pc/InternetArchive/Datasets/gwb_random_pdf/"])

In [7]:
BayesDF = pd.read_csv('Final_BayesProject_PDFs.csv')
BayesDF.columns = ['index', 'filePath', 'researchPublication']
BayesDF = BayesDF.drop(columns = ['index'])
BayesDF

Unnamed: 0,filePath,researchPublication
0,/scratch/yte9pc/InternetArchive/Datasets/fatca...,0
1,/scratch/yte9pc/InternetArchive/Datasets/fatca...,1
2,/scratch/yte9pc/InternetArchive/Datasets/fatca...,0
3,/scratch/yte9pc/InternetArchive/Datasets/fatca...,0
4,/scratch/yte9pc/InternetArchive/Datasets/fatca...,1
...,...,...
115,/scratch/yte9pc/InternetArchive/Datasets/gwb_r...,0
116,/scratch/yte9pc/InternetArchive/Datasets/gwb_r...,0
117,/scratch/yte9pc/InternetArchive/Datasets/gwb_r...,0
118,/scratch/yte9pc/InternetArchive/Datasets/gwb_r...,0


In [47]:
class ParsePDF:
    def __init__(self, pdfPath):
        self.pdfPath = pdfPath
        self.fileName = None
        self.doc = None
        self.numPages = None
        self.toc = None
        self.metadata = None
        self.format = None
        self.title = None
        self.author = None
        self.subject = None
        self.creator = None
        self.producer = None
        self.allText = ''
        self.pageText = ''
        self.parsable = None
    
    def getMetaData(self):
        self.fileName = self.pdfPath.split('/')[-1]
        try:
            # Open the PDF
            self.doc = fitz.open(self.pdfPath)
            self.numPages = self.doc.pageCount
            if not self.doc.getToC():
                self.toc = False
            else:
                self.toc = True
            self.metadata = self.doc.metadata
            self.png = self.doc.loadPage(0).getPixmap()
            self.format = self.metadata.get('format')
            self.title = self.metadata.get('title')
            self.author = self.metadata.get('author')
            self.subject = self.metadata.get('subject')
            self.creator = self.metadata.get('creator')
            self.producer = self.metadata.get('producer')
            self.doc.close()
            return [self.pdfPath, self.fileName, self.numPages, self.toc, 
                    self.format, self.title, self.author, self.subject, self.creator, 
                    self.producer, self.png.height, self.png.width, self.png.size]
        except Exception:
            return [self.pdfPath, self.fileName] + [np.nan] * 11
    
    def getTOC(self):
        self.doc = fitz.open(self.pdfPath)
        try:
            if not doc.getToC():
                return np.nan
            else:
                return doc.getToC()
        except Exception:
            return np.nan
        
    def getPageImage(self, pageNum):
        self.doc = fitz.open(self.pdfPath)
        try:
            if pageNum <= self.doc.pageCount:
                png = self.doc.loadPage(pageNum).getPixmap()
                png.writeImage("%s-%i.png" % (self.pdfPath.split('/')[-1].strip('.png'), pageNum))
                return 'Image Saved'
        except Exception:
            return 'Error getting image'
        
    def getImageSpecs(self, pageNum):
        self.doc = fitz.open(self.pdfPath)
        try:
            if pageNum <= self.doc.pageCount:
                png = self.doc.loadPage(pageNum).getPixmap()
                return [png.height, png.width, png.size]
        except Exception:
            return [np.nan] * 3
        
    # https://pymupdf.readthedocs.io/en/latest/vars.html#textpreserve
    def getAllText(self, opt = 'text'):
        self.doc = fitz.open(self.pdfPath)
        for i in range(0, self.doc.pageCount):
            self.allText += self.doc.loadPage(i).getText(opt, flags = fitz.TEXT_DEHYPHENATE)\
                            .replace('\n', ' ').strip() + ' '
        return self.allText
    
    def getPageText(self, pageNum, opt = 'text'):
        self.doc = fitz.open(self.pdfPath)
        try:
            if pageNum <= self.doc.pageCount:
                self.pageText = self.doc.loadPage(pageNum).getText(opt, flags = fitz.TEXT_DEHYPHENATE)\
                                .replace('\n', ' ').strip()
                return self.pageText
        except Exception:
            return self.pageText 

In [48]:
def metaData(PDFs):
    metaDataDF = pd.DataFrame()
    for pdf in PDFs:
        p = ParsePDF(pdf)
        df = pd.DataFrame(p.getMetaData(), 
            index = ['filePath', 'fileName', 'numPages', 'toc', 'format', 'title', 
                     'author', 'subject', 'creator', 'producer', 
                     'height', 'width', 'size']).transpose()
        metaDataDF = metaDataDF.append(df, ignore_index = True)
    return metaDataDF

In [49]:
%time meta = metaData(BayesDF.filePath.values)

mupdf: expected object number
mupdf: No default Layer config
mupdf: cannot find startxref


CPU times: user 2.84 s, sys: 127 ms, total: 2.97 s
Wall time: 3.08 s


__*Some files are not in english*__

- Words that represent the structure of a paper
    - abtract, introduction, conclusion, reference, table of content
- Words that represent the content of a paper
    - research, analyze, result, table, investigation, explain, theory, study, paper, data, perform
- Words that represent association
    - journal, association, organization, doi, university, school, board

In [50]:
struct = ['abstract', 'introduction', 'conclusion', 'reference', 'table of content']
content = ['research', 'analyze', 'result', 'table', 'investigation', 'explain', 'theor', 'stud', 'paper', 'data', 'perform']
assoc = ['journal', 'association', 'assoc', 'organization', 'doi', 'university', 'school', 'board', 'publish']
types = {'structure' : struct, 'content' : content, 'association' : assoc}

In [51]:
def searchFor(listWords):
    words = []
    PDFs = meta.filePath.values
    for pdf in PDFs:
        p = ParsePDF(pdf)
        text = p.getAllText()
        textLength.append(len(text))
        fileWordCount = []
        for i in listWords:
            fileWordCount.append(text.lower().count(i))
        words.append(fileWordCount)
    meta['numChar'] = textLength
    meta['words'] = textLength
    return words

def searchFor(dictionaryWords):
    lang = []
    textLength = []
    typeCount = {'structure' : [], 'content' : [], 'association' : []}
    PDFs = meta.filePath.values
    for pdf in PDFs:
        p = ParsePDF(pdf)
        text = p.getAllText()
        try:
            lang.append(languages.get(alpha2 = detect(text)).name)
        except:
            lang.append(np.nan)
        textLength.append(len(text))
        for i in dictionaryWords:
            wordTypeCount = 0
            for j in dictionaryWords[i]:
                wordTypeCount += text.lower().count(j)
            typeCount[i].append(wordTypeCount)
            
    meta['structure'] = typeCount['structure']
    meta['content'] = typeCount['content']
    meta['association'] = typeCount['association']
    meta['language'] = lang
    meta['numChar'] = textLength

In [52]:
%time searchFor(types)

mupdf: expected object number
mupdf: No default Layer config
mupdf: cannot find startxref


CPU times: user 8.45 s, sys: 134 ms, total: 8.58 s
Wall time: 8.66 s


In [53]:
dataset = meta[['filePath', 'numPages', 'height', 'width', 'size', 'structure', 'content', 'association', 'language', 'numChar']]

In [54]:
dataset = dataset.merge(BayesDF, left_on = 'filePath', right_on = 'filePath')

In [56]:
finalDataset = dataset[~dataset.numPages.isnull()]

In [57]:
finalDataset

Unnamed: 0,filePath,numPages,height,width,size,structure,content,association,language,numChar,researchPublication
0,/scratch/yte9pc/InternetArchive/Datasets/fatca...,2,828,602,1495456,0,0,0,Polish,6943,0
1,/scratch/yte9pc/InternetArchive/Datasets/fatca...,10,794,595,1417378,2,20,50,Korean,27885,1
2,/scratch/yte9pc/InternetArchive/Datasets/fatca...,14,1243,773,2882605,0,0,0,,14,0
3,/scratch/yte9pc/InternetArchive/Datasets/fatca...,4,794,595,1417378,2,1,4,English,9119,0
4,/scratch/yte9pc/InternetArchive/Datasets/fatca...,4,842,595,1503058,0,71,2,Norwegian,25615,1
...,...,...,...,...,...,...,...,...,...,...,...
115,/scratch/yte9pc/InternetArchive/Datasets/gwb_r...,12,842,596,1505584,0,0,0,English,17184,0
116,/scratch/yte9pc/InternetArchive/Datasets/gwb_r...,4,842,596,1505584,2,59,1,English,5984,0
117,/scratch/yte9pc/InternetArchive/Datasets/gwb_r...,2,842,596,1505584,0,1,1,English,4240,0
118,/scratch/yte9pc/InternetArchive/Datasets/gwb_r...,2,595,841,1501273,1,4,0,English,3803,0


In [72]:
def dfmap(i):
    langeCode = []
    if str(i) in ('English'):
        return '1'
    elif str(i) in ('French', 'Spanish', 'Portuguese', 'Romanian'):
        return '2'
    else:
        return '3'

In [76]:
finalDataset.language = finalDataset.language.apply(dfmap)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [77]:
finalDataset

Unnamed: 0,filePath,numPages,height,width,size,structure,content,association,language,numChar,researchPublication
0,/scratch/yte9pc/InternetArchive/Datasets/fatca...,2,828,602,1495456,0,0,0,3,6943,0
1,/scratch/yte9pc/InternetArchive/Datasets/fatca...,10,794,595,1417378,2,20,50,3,27885,1
2,/scratch/yte9pc/InternetArchive/Datasets/fatca...,14,1243,773,2882605,0,0,0,3,14,0
3,/scratch/yte9pc/InternetArchive/Datasets/fatca...,4,794,595,1417378,2,1,4,1,9119,0
4,/scratch/yte9pc/InternetArchive/Datasets/fatca...,4,842,595,1503058,0,71,2,3,25615,1
...,...,...,...,...,...,...,...,...,...,...,...
115,/scratch/yte9pc/InternetArchive/Datasets/gwb_r...,12,842,596,1505584,0,0,0,1,17184,0
116,/scratch/yte9pc/InternetArchive/Datasets/gwb_r...,4,842,596,1505584,2,59,1,1,5984,0
117,/scratch/yte9pc/InternetArchive/Datasets/gwb_r...,2,842,596,1505584,0,1,1,1,4240,0
118,/scratch/yte9pc/InternetArchive/Datasets/gwb_r...,2,595,841,1501273,1,4,0,1,3803,0


In [78]:
finalDataset.to_csv('FinalDatasetBayes.csv')