## Import Libraries

In [1]:
import logging
import pandas as pd
import numpy as np
import os
import json
import urllib.request
import pathlib

import requests
from bs4 import BeautifulSoup
import PyPDF2
import re
#from fuzzywuzzy import fuzz

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
import io
import os

# Notebook display settings: show up to 20 columns and let frames print 320 characters wide.
desired_width = 320
_display_options = {
    "display.max_columns": 20,
    "display.width": desired_width,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

In [218]:
# ===== START LOGGER =====
# Module-level logger for this notebook; its records propagate to the root
# logger, which is configured below to emit INFO and above.
logger = logging.getLogger(__name__)
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

# Re-running this cell must not stack a second handler on the root logger,
# otherwise every record is printed once per execution of the cell. The handler
# is tagged with a name so an earlier instance can be found and reused.
_NOTEBOOK_HANDLER_NAME = "notebook_stream_handler"
sh = next(
    (h for h in root_logger.handlers if h.get_name() == _NOTEBOOK_HANDLER_NAME),
    None,
)
if sh is None:
    sh = logging.StreamHandler()
    sh.set_name(_NOTEBOOK_HANDLER_NAME)
    root_logger.addHandler(sh)
sh.setFormatter(formatter)

## Web Scraping
In this step, we use the download URL to download the paper itself and save it using the clean title.

In [2]:
# Paper metadata table; the first CSV column becomes the index. Later cells read
# its 'clean title', 'url', 'title' and 'authors abbrev' columns.
lda_df = pd.read_csv("outputs/lda_df.csv", index_col=0)

# Directory that holds the downloaded PDFs. Other cells build file names with
# plain string concatenation (path + filename), so the value must end with a
# path separator; os.path.join(..., "") guarantees that.
# Set the NETWORK_FILES_DIR environment variable to run on another machine; the
# default keeps the original location.
path = os.path.join(
    os.environ.get(
        "NETWORK_FILES_DIR",
        '/Users/marisolhernandez/Desktop/SKAEL/Network Diagram/files/',
    ),
    "",
)

In [3]:
def download_file(download_url, filename, dest_dir=None):
    """Download ``download_url`` and save it as ``<filename>.pdf``.

    Parameters
    ----------
    download_url : str
        URL passed to ``urllib.request.urlopen``.
    filename : str
        File name without extension; ``".pdf"`` is appended.
    dest_dir : str, optional
        Directory to write into. When omitted, the notebook-level ``path``
        variable is used, exactly as before.

    Returns
    -------
    str
        Full path of the file that was written.

    Raises
    ------
    urllib.error.URLError, OSError
        Propagated unchanged when the download or the write fails.
    """
    # ``path`` already ends with a separator; a caller-supplied directory may not.
    directory = path if dest_dir is None else os.path.join(dest_dir, "")
    target = directory + filename + ".pdf"

    # Context managers close both the HTTP response and the output file even
    # when reading or writing raises part-way through.
    with urllib.request.urlopen(download_url) as response:
        payload = response.read()
    with open(target, 'wb') as file:
        file.write(payload)
    return target

In [4]:
# Download files
# Set DOWNLOAD_PAPERS to True to (re-)fetch every PDF from its 'url' column.
# With the default False only the 'filename' column is rebuilt, which is what
# this cell did while the download call was commented out.
DOWNLOAD_PAPERS = False

for i, row in lda_df.iterrows():
    title = row['clean title']

    try:
        if DOWNLOAD_PAPERS:
            download_file(row['url'], title)
        lda_df.loc[[i], 'filename'] = title + '.pdf'

    except Exception as exc:
        # Best effort: one bad row (missing title, unreachable URL, ...) must not
        # stop the others. ``Exception`` rather than a bare ``except`` so that
        # interrupting the kernel still works. An empty filename marks the row
        # as having no PDF on disk.
        lda_df.loc[[i], 'filename'] = ''
        print(
            'Failed\nindex: ' + str(i)
            + '\ntitle: ' + str(title)
            # .get() so a missing 'url' column cannot raise inside the handler
            + '\nurl: ' + str(row.get('url'))
            + '\nerror: ' + repr(exc) + '\n'
        )

In [5]:
# Fill missing value
# Replace every NaN in lda_df with an empty string. The citation-matching cell
# calls .lower() on the 'title' and 'authors abbrev' values and concatenates
# 'filename' onto the PDF directory, so those cells must hold strings.
lda_df = lda_df.fillna("")

## Add Citations Data
We open and read each paper, searching its text for the titles and abbreviated author names of the other papers. If another paper's title or author string is found, a connection is recorded between the two papers.

In [6]:
# Containers filled in by the PDF-scanning cell below.
# cite_links: one comma-joined string of matched row ids per paper, in row order.
cite_links = []
# counter: running total of links recorded.
counter = 0
# cited_by: row position (as a string) -> list of ids of the papers that mention it.
cited_by = dict((str(position), []) for position in range(len(lda_df)))

In [7]:
# Shared pdfminer pipeline: pages rendered by ``interpreter`` are written as
# text into the ``retstr`` buffer through ``device``.
rsrcmgr = PDFResourceManager()
retstr = io.StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)


def _extract_pdf_text(pdf_path):
    """Return the lower-cased text of every page of ``pdf_path``.

    Each page's text is prefixed with a single space, and pages are
    concatenated in order. Raises whatever ``open`` or pdfminer raise for an
    unreadable file.
    """
    # Start from an empty buffer: a previous file that failed half-way through
    # a page may have left text behind that must not leak into this one.
    retstr.seek(0)
    retstr.truncate(0)

    chunks = []
    # ``with`` closes the PDF even when pdfminer raises while parsing it.
    with open(pdf_path, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            chunks.append(' ' + retstr.getvalue().lower())
            # Reset the shared buffer so the next page starts clean.
            retstr.truncate(0)
            retstr.seek(0)
    return ''.join(chunks)


def _build_needles(column):
    """Return ``[(lower-cased value, id of the first row holding it), ...]``.

    Blank and non-string values are skipped: an empty string is a substring of
    every document and would link every paper to the first row with a missing
    title or author.
    """
    first_seen = {}
    for idx, value in column.items():
        if isinstance(value, str) and value.strip() and value not in first_seen:
            first_seen[value] = str(idx)
    return [(value.lower(), tgt_id) for value, tgt_id in first_seen.items()]


# Built once instead of filtering the whole frame for every candidate string.
# Titles are checked before author strings, both in row order.
needles = _build_needles(lda_df['title']) + _build_needles(lda_df['authors abbrev'])

for i, row in lda_df.iterrows():
    src_id = str(i)
    filename = row['filename']

    # No PDF recorded for this row: nothing to scan.
    if not isinstance(filename, str) or not filename:
        cite_links.append("")
        continue

    # Only the file reading is best-effort. Matching happens after it succeeded,
    # so a failure can no longer leave cited_by updated while cite_links is "".
    try:
        pages = _extract_pdf_text(path + filename)
    except Exception as exc:
        logging.getLogger(__name__).warning(
            "Could not read %s (row %s): %r", filename, src_id, exc
        )
        cite_links.append("")
        continue

    # Add citation data: every other paper whose title or author string occurs
    # in this paper's text, each target recorded once.
    conn_nodes = []
    for needle, tgt_id in needles:
        if tgt_id == src_id or tgt_id in conn_nodes:
            continue
        if needle in pages:
            conn_nodes.append(tgt_id)
            # setdefault: the index labels need not be 0..n-1.
            cited_by.setdefault(tgt_id, []).append(src_id)
            counter += 1

    cite_links.append(",".join(conn_nodes))

In [8]:
# cite_links must hold exactly one entry per row of lda_df, in row order. It
# grows every time the scanning cell runs, so fail early with a clear message
# if that cell was executed more than once without resetting the containers.
if len(cite_links) != len(lda_df):
    raise ValueError(
        "cite_links has " + str(len(cite_links)) + " entries but lda_df has "
        + str(len(lda_df)) + " rows; re-run the cell that initialises "
        "cite_links / cited_by before scanning the PDFs again."
    )

network_df = lda_df.assign(citations=cite_links)
network_df["citations"] = network_df["citations"].fillna("")

# Look each row up by its index label (the ids stored in cited_by are
# str(index label)), tolerate rows nobody cites, and drop repeated citing ids
# while keeping first-seen order so n_cites counts distinct papers.
citing_ids = [
    list(dict.fromkeys(cited_by.get(str(idx), []))) for idx in network_df.index
]
cited_by_list = [",".join(ids) for ids in citing_ids]
network_df = network_df.assign(cited_by=cited_by_list)
network_df = network_df.assign(n_cites=[len(ids) for ids in citing_ids])

# Missing values were already turned into "" earlier in the notebook, so
# fillna alone would never insert the placeholder; replace the empty strings too.
for col in ["title", "department", "pub_date", "authors", "authors abbrev"]:
    network_df[col] = network_df[col].replace("", "No data").fillna("No data")

## Output File
Before writing the output file, we remove duplicate entries from the `cited_by` and `citations` columns.

In [11]:
# Remove duplicates in cited by and citations
def _dedupe_csv(value):
    """Drop repeated ids from a comma-separated string, keeping first-seen order.

    An empty string is returned unchanged. ``dict.fromkeys`` is used instead of
    ``set`` so the result is the same on every run.
    """
    if value == '':
        return ''
    return ",".join(dict.fromkeys(value.split(',')))


def _count_csv(value):
    """Number of ids in a comma-separated string; 0 for the empty string."""
    return len(value.split(',')) if value else 0


# Column-wise instead of a positional loop: works for any index, and does not
# rebind the notebook-level ``cited_by`` dict that an earlier cell still needs
# when it is re-run.
network_df['cited_by'] = network_df['cited_by'].map(_dedupe_csv)
network_df['citations'] = network_df['citations'].map(_dedupe_csv)
network_df['n_cites'] = network_df['cited_by'].map(_count_csv)

In [14]:
# Destination of the final table. Set the NETWORK_DF_FILE environment variable
# to write somewhere else; the default keeps the original location.
network_df_file = os.environ.get(
    "NETWORK_DF_FILE",
    "/Users/marisolhernandez/Desktop/SKAEL/Network Diagram/outputs/network_df.csv",
)
# The index is written as the first CSV column (pandas default).
network_df.to_csv(network_df_file)