# 'Human Brain Mapping' Journal Analysis

This notebook crawls 'Human Brain Mapping' articles and constructs a knowledge graph from them.

Import libraries

In [123]:
import os
import sys
import time

import bs4
import mechanicalsoup
import pandas as pd
import progressbar
import requests
import scrapy
import tqdm
from bs4 import BeautifulSoup
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains

# Extend the import search path with the shared utilities folder so that
# project-local helper modules can be imported further down the notebook
# (presumably this is where `util` lives -- confirm against the repo layout).
_utils_dir = os.path.join(
    os.path.abspath(""),
    "../../../work/git_repos/EMA/equipment-advisor-diagnosis-model-generation/utils",
)
sys.path.append(_utils_dir)

In [124]:
from util import getlog

# Notebook-wide logger; the notebook's working directory is passed as `dir`
# and the level is set to DEBUG.
logger = getlog("humanBrainMapping", dir=os.path.abspath(""), level="DEBUG")

Download full texts of 'Human Brain Mapping' articles from [this link](https://www.ncbi.nlm.nih.gov/pmc/journals/3754/)

In [156]:
def _issueDate(issueTag):
    """Return the text that follows ``<br/>`` inside an issue anchor, or "" if absent."""
    parts = str(issueTag).split("<br/>")
    if len(parts) < 2:
        return ""
    dateText = parts[1]
    # Remove the closing tag as a suffix; str.strip("</a>") would treat the
    # argument as a character set and could also eat letters of the date.
    if dateText.endswith("</a>"):
        dateText = dateText[: -len("</a>")]
    return dateText


def _waitForDownload(path, timeout, pollInterval=1):
    """Poll until ``path`` exists; return False if ``timeout`` seconds elapse first.

    ``timeout=None`` waits indefinitely.
    """
    deadline = None if timeout is None else time.time() + timeout
    while True:
        time.sleep(pollInterval)
        if os.path.exists(path):
            return True
        if deadline is not None and time.time() >= deadline:
            return False


def getFullTextHBM(
    htmlFile,
    domain="https://www.ncbi.nlm.nih.gov",
    maxIssues=90,
    chromedriverPath=None,
    downloadTimeout=300,
):
    """Download the PDF full texts of HBM issues and collect their metadata.

    The issue index is read from a locally saved HTML page. For each issue the
    issue page is rendered with Chrome, every listed paper's metadata is
    extracted, and each PDF link is opened in a second Chrome instance that is
    configured to save PDFs into ``corpus/<issue>/`` under the current working
    directory. Papers whose target file already exists are skipped.

    Args:
        htmlFile: Path (relative to the working directory) of the saved issue
            index page containing ``<a class="arc-issue">`` links.
        domain: URL prefix prepended to every relative link.
        maxIssues: Number of leading issues from the index to process
            (90 by default, the recent 5 years of issues); ``None`` processes all.
        chromedriverPath: Path to the chromedriver executable. Defaults to the
            project's ``scrapingTools/chromedriver`` location.
        downloadTimeout: Seconds to wait for a PDF to appear on disk before
            logging an error and moving on; ``None`` waits indefinitely.

    Returns:
        A list with one dict per issue holding the keys ``issue``, ``url``,
        ``time`` and ``articles`` (a list of per-paper dicts with whichever of
        ``url``, ``pmc_id``, ``title``, ``authors`` could be extracted).
    """
    baseDir = os.path.abspath("")
    if chromedriverPath is None:
        chromedriverPath = os.path.join(
            baseDir,
            "../../../work/git_repos/EMA/equipment-advisor-diagnosis-model-generation/data/scrapingTools/chromedriver",
        )

    with open(os.path.join(baseDir, htmlFile), "r") as f:
        soup = BeautifulSoup(f, "html.parser")
    issues = soup.find_all("a", class_="arc-issue")[:maxIssues]

    issuesMetadata = []
    barall = progressbar.ProgressBar(len(issues))
    barall.start()
    for issueIdx, issue in enumerate(issues):
        issueUrl = domain + issue.attrs["href"]

        # Render the issue page in a throw-away browser; always release it,
        # even if loading or parsing fails.
        browser = webdriver.Chrome(chromedriverPath)
        try:
            browser.get(issueUrl)
            issuePage = BeautifulSoup(browser.page_source, "html.parser")
        finally:
            browser.quit()
        allpapers = issuePage.find_all("div", class_="rprt")

        meta = dict()
        meta["issue"] = issue.contents[0]
        issueDir = os.path.join(baseDir, "corpus", meta["issue"].strip(": "))
        os.makedirs(issueDir, exist_ok=True)
        meta["url"] = issueUrl
        meta["time"] = _issueDate(issue)
        meta["articles"] = []
        logger.info("downloading issue " + meta["issue"])
        logger.info(meta)
        bar = progressbar.ProgressBar(len(allpapers))
        bar.start()

        # Second browser: configured so that PDF links are saved straight into
        # this issue's folder instead of being opened in the viewer.
        chrome_options = Options()
        chrome_options.add_experimental_option(
            "prefs",
            {
                "download.default_directory": issueDir,
                "download.prompt_for_download": False,
                "download.directory_upgrade": True,
                "plugins.always_open_pdf_externally": True,
            },
        )
        browser1 = webdriver.Chrome(chromedriverPath, options=chrome_options)
        try:
            for paperIdx, paper in enumerate(allpapers):
                pmeta = dict()
                try:
                    titleLink = paper.find("div", class_="title").find_next()
                    pmeta["url"] = domain + titleLink.attrs["href"]
                    pmeta["pmc_id"] = (
                        paper.find("dl", class_="rprtid").contents[1].contents[0]
                    )
                    pmeta["title"] = titleLink.contents[0]
                    pmeta["authors"] = (
                        paper.find("div", class_="supp")
                        .find("div", class_="desc")
                        .contents[0]
                    )
                except Exception as e:
                    # Best effort: keep whatever fields were extracted so far.
                    logger.error(str(e))
                meta["articles"].append(pmeta)

                try:
                    alllinks = paper.find("div", class_="links")
                    for linkTag in alllinks.find_all("a", class_="view"):
                        if not linkTag.contents[0].startswith("PDF"):
                            continue  # only the PDF full-text link is wanted
                        link = domain + linkTag.attrs["href"]
                        target = os.path.join(issueDir, link.split("/")[-1])
                        if os.path.exists(target):
                            logger.info("paper already exists in repo")
                            continue
                        logger.info("saving paper from " + str(link))
                        browser1.get(link)
                        if _waitForDownload(target, downloadTimeout):
                            logger.info("saved paper")
                        else:
                            # Do not hang the whole crawl on one failed download.
                            logger.error(
                                "timed out waiting for download of " + str(link)
                            )
                except Exception as e:
                    logger.error("error occurred when downloading paper")
                    logger.error(str(e))
                bar.update(paperIdx)
            issuesMetadata.append(meta)
            time.sleep(30)  # wait for the last document to be downloaded
        finally:
            browser1.quit()
        barall.update(issueIdx)
    return issuesMetadata

In [None]:
# Run the crawl against the locally saved issue index page; `s` receives the
# list of per-issue metadata dicts returned by getFullTextHBM.
s = getFullTextHBM("hbmissues.html")

[ 2020-08-09 16:34:59,130 MainProcess <ipython-input-156-46aac1fbfe7e> getFullTextHBM 34 INFO ] downloading issue v.41(1):  
[ 2020-08-09 16:34:59,131 MainProcess <ipython-input-156-46aac1fbfe7e> getFullTextHBM 35 INFO ] {'issue': 'v.41(1): ', 'url': 'https://www.ncbi.nlm.nih.gov/pmc/issues/358883/', 'time': '2020 Jan', 'articles': []} 
[ 2020-08-09 16:35:01,263 MainProcess <ipython-input-156-46aac1fbfe7e> getFullTextHBM 100 INFO ] paper already exists in repo 
[ 2020-08-09 16:35:01,267 MainProcess <ipython-input-156-46aac1fbfe7e> getFullTextHBM 100 INFO ] paper already exists in repo 
[ 2020-08-09 16:35:01,271 MainProcess <ipython-input-156-46aac1fbfe7e> getFullTextHBM 100 INFO ] paper already exists in repo 
[ 2020-08-09 16:35:01,276 MainProcess <ipython-input-156-46aac1fbfe7e> getFullTextHBM 100 INFO ] paper already exists in repo 
[ 2020-08-09 16:35:01,281 MainProcess <ipython-input-156-46aac1fbfe7e> getFullTextHBM 100 INFO ] paper already exists in repo 
[ 2020-08-09 16:35:01,286 

PDF parsing using pdfbox