## Getting predatory journals

We follow Beall's list [here](https://beallslist.net).

Two sections:
- original list: def red flag
- Updated list: May be predatory -> yellow flag


problem with Beall's list: 82% accuracy, based on the trial of a Science correspondent. 

In [69]:
from bs4 import BeautifulSoup
import requests
import mwparserfromhell
import tldextract

In [6]:
response = requests.get("https://beallslist.net")
soup = BeautifulSoup(response.content, 'html.parser')
all_list = soup.find_all("ul")
for ele in all_list:
    title = ele.find_previous("h2")
    if title:
        if title.text=="Update":
            update = ele
        elif title.text == "Original list":
            original = ele
        else:
            continue

In [None]:
original

In [8]:
from collections import defaultdict
yellow_flags = update.find_all("li")
yellow_flags_dict = defaultdict(dict)
for ele in yellow_flags:
    publication = ele.find("a")
    name, link = publication.text, publication["href"]
    yellow_flags_dict[name] = {"link": link, "acronym": ""}

In [9]:
len(yellow_flags_dict)

163

In [10]:
red_flags = original.find_all("li")
red_flags_dict = defaultdict(dict)
for ele in red_flags:
    publication = ele.find("a")
    if publication:
        name, link = publication.text, publication["href"]
        red_flags_dict[name] = {"link": link, "acronym": ""}
    else:
        # Only 3 journals didn't have a hyperlink
        red_flags_dict[ele.name] = {"link": "", "acronym": ""}
        
    

In [11]:
#one "" different from """"""
test = "American Research Journals"
if test in red_flags_dict:
    print(red_flags_dict[test])


{'link': 'http://www.arjonline.org/', 'acronym': ''}


In [82]:
# TEST
ext = tldextract.extract('https://web.archive.org/web/20160304232400/http://www.bjournal.co.uk/paper/BJASS_11_1/BJASS_11_01_03.pdf')
ext.domain

'archive'

## Get the type of citation

In [None]:
pip install tldextract

In [None]:
pip install bibtexparser

In [249]:
# TEST
from datetime import datetime 

date = datetime.strptime("2033-jun", '%Y-%b')
print(date)

2033-06-01 00:00:00


In [55]:
import bibtexparser
from dateutil import parser
import json

def get_metainfo_doi(doi):
    """Input: doi string
    Output: the journal name and date published of the article. Return None for each value if the can't parsed
    """
    res = requests.get("http://dx.doi.org/"+ doi, headers={"Accept": "application/x-bibtex"})
    res = res.content.decode('utf-8')
    bibtext = bibtexparser.loads(res).entries
    if len(bibtext) >0 :
        journal = bibtext[0]["journal"].strip() if "journal" in bibtext[0] else None
        time_published = ""

        if "year" in bibtext[0]:
            time_published += bibtext[0]["year"]
        if "month" in bibtext[0]:
            time_published += " " + bibtext[0]["month"]
        if "day" in bibtext[0]:
            time_published += " " + bibtext[0]["day"]
        if len(time_published) > 0:
            time_published = parser.parse(time_published)
        else:
            time_published = None

        return journal, time_published
    else:
        return None, None


def get_metainfo_pmc(pmc):
    res = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pmc&id="+pmc+"&retmode=json")
    res = res.content.decode("utf-8")
    res = json.loads(res)
    data = res["result"][pmc]
    journal, time_published = None, None
    if "error" in data:
        return None, None
    else:
        journal = data["fulljournalname"].strip()
        time_published = parser.parse(data["pubdate"])
        return journal, time_published


def get_metainfo_pmid(pmid):
    res = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id="+pmid+"&retmode=json")
    res = res.content.decode("utf-8")
    res = json.loads(res)
    data = res["result"][pmid]
    journal, time_published = None, None
    if "error" in data:
        return None, None
    else:
        journal = data["fulljournalname"].strip()
        time_published = parser.parse(data["pubdate"])
        return journal, time_published


In [56]:
get_metainfo_doi("10.1038/nrd842")
# get_metainfo_pmc("33829083") #supposed to show error
# get_metainfo_pmc("8008929")
# get_metainfo_pmid("8008929")
# get_metainfo_pmid("25266471")

('Nature Reviews Drug Discovery', datetime.datetime(2002, 7, 25, 0, 0))

In [63]:
import requests
import tldextract
from collections import defaultdict

        
def parse_html(page_url):
  """ This function parse metadata of citations from HTML tag.
    Input: wiki_url
    Output: a parsed citation list from HTML. Each citation has format key: value
        key: the text version of all citation
        value: a dictionary with schema 
            {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime |None}"""
  
  citation_types = {'web', 'journal', 'book', 'conference', 'news'}
  all_parsed_citations = defaultdict(dict)
  response = requests.get(page_url)
  soup = BeautifulSoup(response.content, 'html.parser')

  # Get all the references
  all_ref = []
  ordered_ref_lst = soup.find_all("ol", {"class": "references"})
  for each in ordered_ref_lst:
    refs = each.find_all("li")
    all_ref += refs

  for ele in all_ref:
    #Check if it has <span class="reference text">
    ref = ele.find("span", {"class":"reference-text"})
    
    source_type = "other" #first default value for source_type

    if ref:
      # TASK: get all essential information from citation tag
      citation_key = ref.get_text()
      hyperlink = ref.find("a", {"class": "external text"})
      external_link = hyperlink["href"] if hyperlink else None

      # TASK: find source type, ie whether it's 'web', 'journal', 'book', 'conference', 'news'
      cite_tag = ref.find("cite") 
      if cite_tag:
        for class_tag in cite_tag["class"]:
          if class_tag in citation_types:
            source_type = class_tag      
            break
          
      # TASK: get publisher (journal name for journal or conference, domain website for webs, 'work' for news)
      #for journal, conference, others look for DOI or PMID or PMC
      if source_type in {'journal', 'conference', 'other'}:
        has_doi = ref.find("a", {"title": "Doi (identifier)"})
        has_pmc = ref.find("a", {"title": "PMC (identifier)"})
        has_pmid = ref.find("a", {"title": "PMID (identifier)"})
        journal, date = None, None
        if has_doi:
          doi = has_doi.find_next("a", {"class": "external text"})
          journal, date = get_metainfo_doi(doi.text)
        elif has_pmc:
          pmc = has_pmc.find_next("a", {"class": "external text"})
          journal, date = get_metainfo_pmc(pmc.text)
        elif has_pmid:
          pmid = has_pmid.find_next("a", {"class": "external text"})
          journal, date = get_metainfo_pmid(pmid.text)         
        
        all_parsed_citations[citation_key] = {"external_link": external_link, "type": source_type, "html_tag": ele, "publisher": journal, "date": date}

      # for news, web, other that hasn't been parsed, publisher is the domain of the website 
      elif source_type in {'news', 'web', 'other'}:
        publisher = tldextract.extract(external_link).domain if external_link else None
        all_parsed_citations[citation_key] = {"external_link": external_link, "type": source_type, "html_tag": ele, "publisher": publisher, "date": None}
        

  return all_parsed_citations




In [64]:
# test_sol_ac = parse_html("https://en.wikipedia.org/wiki/Solanum_acaule")
test_dem_albania = parse_html("https://en.wikipedia.org/wiki/Democratic_Party_of_Albania")
#test_opth = parse_html("https://en.wikipedia.org/wiki/Optical_coherence_tomography")

In [65]:
test_dem_albania

defaultdict(dict,
            {'"European Election Watch Albania". Center for Strategic and International Studies. Retrieved 4 January 2022.': {'external_link': 'https://www.csis.org/programs/european-election-watch/albania',
              'type': 'web',
              'html_tag': <li id="cite_note-CSIS-1"><span class="mw-cite-backlink">^ <a href="#cite_ref-CSIS_1-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-CSIS_1-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-CSIS_1-2"><sup><i><b>c</b></i></sup></a> <a href="#cite_ref-CSIS_1-3"><sup><i><b>d</b></i></sup></a></span> <span class="reference-text"><style data-mw-deduplicate="TemplateStyles:r1133582631">.mw-parser-output cite.citation{font-style:inherit;word-wrap:break-word}.mw-parser-output .citation q{quotes:"\"""\"""'""'"}.mw-parser-output .citation:target{background-color:rgba(0,127,255,0.133)}.mw-parser-output .id-lock-free a,.mw-parser-output .citation .cs1-lock-free a{background:url("//upload.wikimedia.org/wikipedia/co

## Match with WikiText

In [None]:
pip install mwparserfromhell

In [62]:
import re
from difflib import SequenceMatcher
from dateutil import parser

def parse_match_wikitext(wiki_url):
    """
    This function parse wikitext version of the citations, match it with the HTML version, 
    and extract more information, such as publisher and date that weren't extracted on the HTML.
    Input: wiki_url
    Output: a fully parsed citation list. Each citation has format key: value
        key: the text version of all citation
        value: a dictionary with schema 
            {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime |None}
    """
    parsed_citation = parse_html(wiki_url)
    print("ALL citation", len(parsed_citation))
    wiki_page = wiki_url.split("wiki/")[1]
    url = "https://en.wikipedia.org/w/index.php?title=" + wiki_page +"&action=raw"
    response = requests.get(url)
    text = response.text
    wikicode = mwparserfromhell.parse(text)
    
    all_found = 0

    # Create a copy of not fully parsed citation, ie one that lacks publisher or date param
    not_fully_parsed = defaultdict(dict)
    for key, val in parsed_citation.items():
        if not val["publisher"] or not val["date"]:
            not_fully_parsed[key] = val

    for tpl in wikicode.filter_templates(matches="{{cite"):
        #tpl is template, for a template in wikitext
        found_match = None
        
        # Match on external link:
        if tpl.has_param("url"):
            external_url = tpl.get("url").split("=")[1]
            for key, val in not_fully_parsed.items():
                if val["external_link"]:
                    if val["external_link"].strip() == external_url.strip():
                        found_match = key
                        all_found += 1
                        break
        # if not found match by URL, find by title
        if not found_match:
            if tpl.has_param("title"):
                #Get the title of citation in without formatting text
                title = tpl.get("title").split("=")[1] 
                title = re.sub('[^A-Za-z0-9 ]+', '', title) #filter out extra formatting
                for key in not_fully_parsed.keys():
                    if title in key:
                        found_match = key
                        all_found += 1
                        break
        
        if found_match:
            # Fetch publisher/ journal name from wikitext
            if not parsed_citation[found_match]["publisher"]:
                publisher = None
                if tpl.has_param("journal"): #for journal name
                    publisher = tpl.get("journal").split("=")[1] 
                elif tpl.has_param("publishder"): #for website or book publisher
                    publisher = tpl.get("publisher").split("=")[1]
                elif tpl.has_param("work"): #for news/ magazine name
                    publisher = tpl.get("work").split("=")[1]
                
                if publisher:
                    publisher = re.sub('[^A-Za-z0-9 ]+', '', publisher)
                parsed_citation[found_match]["publisher"] = publisher

            # Fetch publication date from wikitext
            if not parsed_citation[found_match]["date"]:
                date = None
                if tpl.has_param("date"):
                    date = tpl.get("date").split("=")[1]
                    if len(date) >= 4: #at least 4 digits for year, or yy-mm format
                        date = parser.parse(date)
                parsed_citation[found_match]["date"] = date
            

    print("TOTAL MATCH: ", all_found)
    return parsed_citation
        

def check_source_quality(wiki_url):
    parsed = parse_match_wikitext(wiki_url)
    red_flag_list = []
    yellow_flag_list = []

    for citation in parsed:
        # Check for journals/ conference/ other
        if citation["type"] in {"journal", "conference", "other"}:
            # Check on the domain of external link
            if citation["external_link"]:
                domain = tldextract.extract(citation["external_link"]).domain
                if domain 
            # Check on the journal name
        



        #TODO: Find the journal name -> check if it's in red or yellow list
            # TODO: split it to a different support function
        # if tpl.has("journal"):
        #     journal = tpl.get("journal").split("=")[1]
        #     if journal == "American Research Journals":
        #         print(tpl)
        #     if journal in red_flags_dict.keys():
        #         print("RED FLAG: ", journal)
        #     if journal in yellow_flags_dict.keys():
        #         print("YELLO FLAG: ", journal)
parsed = parse_match_wikitext("https://en.wikipedia.org/wiki/Optical_coherence_tomography")

ALL citation 102
TOTAL MATCH:  14


In [36]:
# TEST
parser.parse("23 March 2022 ")

datetime.datetime(2022, 3, 23, 0, 0)

In [133]:
"Cyst Nematode and its Impacts on Soybean and Potato: A Review" in """Rehman, Fazal UR (30 April 2021). "Cyst Nematode and its Impacts on Soybean and Potato: A Review". Acta Scientific Biotechnology. 2 (2): 17–22. Retrieved 13 November 2021."""

True

In [190]:
a = mwparserfromhell.parse("{{cite |last1=Rehman |first1=Fazal UR |date=30 April 2021 |title=Cyst Nematode and its Impacts on Soybean and Potato: A Review |url=https://www.researchgate.net/publication/351345007 |journal=Acta Scientific Biotechnology |volume=2 |issue=2 |pages=17–22 |doi= |access-date=13 November 2021 }}").filter(matches="{{cite")
for ele in a:
    pass
    # if ele.has("title"):
    #     title = ele.get("title").split("=")[1]
    #     print(title)


In [None]:
def evaluate_citation(wikiUrl):
    """Input: English Wikipedia URL 
    Output:
    - how many citations each type has
        - red-flag articles/ news/ website
        - yellow-flag articles/  news/ sites
        - green-flag artciles/ news/ sites
    - Summary stats about publication/ access date
    - summary dataviz on type distribution: pie chart
    """
    pass