### Introduction
This notebook fetches information from Beall's list to find the URL and name of predatory journals, which will be tagged red and yellow flags for scholarly sources

In [11]:
# Import all the models
from bs4 import BeautifulSoup
import requests
import mwparserfromhell
import tldextract

In [12]:
# scrape the Beall's List website for lists of journals. Original list contains red flag journals, while Updated list contains green flag journals
response = requests.get("https://beallslist.net")
soup = BeautifulSoup(response.content, 'html.parser')
all_list = soup.find_all("ul")
for ele in all_list:
    title = ele.find_previous("h2")
    if title:
        if title.text=="Update":
            update = ele
        elif title.text == "Original list":
            original = ele
        else:
            continue

In [13]:
# Create yellow_flags_dict, which has the schema key:value = journal_name: journal_link
from collections import defaultdict
yellow_flags = update.find_all("li")
yellow_flags_dict = defaultdict(dict)
for ele in yellow_flags:
    publication = ele.find("a")
    name, link = publication.text, publication["href"]
    yellow_flags_dict[name] = link

In [14]:
# Create red_flags_dict, which has the schema key:value = journal_name: journal_link

red_flags = original.find_all("li")
red_flags_dict = defaultdict(dict)
for ele in red_flags:
    publication = ele.find("a")
    link = ""
    if publication:
        name, link = publication.text, publication["href"]
        red_flags_dict[name] = link

In [15]:
#reverse the yellow_flag_dict and red_flags_dict to create dictionaries whose schema is key:value = domain of link : journal name
red_flags_reverse = defaultdict(list)
yellow_flags_reverse = defaultdict(list)
for key, val in red_flags_dict.items():
    val_domain = tldextract.extract(val).domain
    red_flags_reverse[val_domain].append(key)

for key, val in yellow_flags_dict.items():
    val_domain = tldextract.extract(val).domain
    yellow_flags_reverse[val_domain].append(key)

### Fetch green-flag journals

Fetch journals enlisted on DOAJ, which is more likely to have ethical practices for open sourced papers.


In [16]:
import pandas as pd

doaj = pd.read_csv("doaj.csv")
doaj

Unnamed: 0,Journal title,Journal URL,URL in DOAJ,When did the journal start to publish all content using an open license?,Alternative title,Journal ISSN (print version),Journal EISSN (online version),Keywords,Languages in which the journal accepts manuscripts,Publisher,...,URL for journal's Open Access statement,Continues,Continued By,LCC Codes,Subjects,DOAJ Seal,Added on Date,Last updated Date,Number of Article Records,Most Recent Article Added
0,Anais da Academia Brasileira de Ciências,http://www.scielo.br/scielo.php?script=sci_ser...,https://doaj.org/toc/ed09859a464f4461b1af34279...,2000.0,Annals of the Brazilian Academy of Sciences,0001-3765,1678-2690,"biological sciences, exact and earth sciences,...",English,Academia Brasileira de Ciências,...,http://www.scielo.br/revistas/aabc/isubscrp.htm,,,Q,Science,No,2004-04-23T21:31:00Z,2017-01-04T14:19:54Z,3913,2023-03-28T07:31:32Z
1,ACME,http://riviste.unimi.it/index.php/ACME,https://doaj.org/toc/b1ca04ba56194f29a362b3eef...,2014.0,,0001-494X,2282-0035,"italian literature, classic literature, lingui...",Italian,Università degli Studi di Milano,...,http://riviste.unimi.it/index.php/ACME/about/e...,,,A,General Works,No,2014-12-22T19:55:58Z,2022-08-10T19:13:04Z,224,2023-03-16T15:14:11Z
2,Acta Biochimica Polonica,https://ojs.ptbioch.edu.pl/index.php/abp,https://doaj.org/toc/eea321d7f4c34a59be8fd1203...,2000.0,,0001-527X,1734-154X,"molecular biology, biophysics, bioinformatics,...",English,Polish Biochemical Society,...,https://ojs.ptbioch.edu.pl/index.php/abp/about,,,QD415-436|QH426-470,Science: Chemistry: Organic chemistry: Biochem...,No,2022-05-23T10:09:09Z,2022-05-23T10:09:09Z,0,
3,Acta Dermato-Venereologica,https://medicaljournalssweden.se/actadv,https://doaj.org/toc/ffde9666ab1d46f1a8c688ce6...,2017.0,,0001-5555,1651-2057,"sexually transmitted infections, psoriasis, ps...",English,Medical Journals Sweden,...,https://medicaljournalssweden.se/actadv/oapolicy,,,RL1-803,Medicine: Dermatology,No,2011-11-10T12:31:05Z,2023-01-19T09:14:05Z,1581,2023-03-30T09:20:23Z
4,Acta Médica Costarricense,http://actamedica.medicos.cr/index.php/Acta_Me...,https://doaj.org/toc/a5919aee5ad2413a89cf32df0...,2019.0,,0001-6012,2215-5856,"medicine, public health, medical sciences, health","Spanish, English",Colegio de Médicos y Cirujanos de Costa Rica,...,http://actamedica.medicos.cr/index.php/Acta_Me...,,,R,Medicine,No,2020-12-22T11:08:24Z,2022-07-29T11:54:21Z,1207,2015-12-08T15:06:43Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,Football(s),https://preo.u-bourgogne.fr/football-s/,https://doaj.org/toc/ea91a02a465e4dbf8386b899a...,2022.0,"Football(s): Histoire, Culture, Économie, Société",2967-0837,2968-0115,"history, study of politics, cultural history, ...",French,Presses Universitaires de Franche-Comté (PUFC),...,https://preo.u-bourgogne.fr/football-s/index.p...,,,GV557-1198.995|HC10-1085,Geography. Anthropology. Recreation: Recreatio...,No,2023-01-14T13:39:37Z,2023-01-14T13:39:37Z,0,
19154,"International Journal of Home Economics, Hospi...",https://www.ijhhr.org/,https://doaj.org/toc/826a1b161368467cad3ac6190...,2022.0,IJHHR,2971-5121,,"home economics, hospitality management, allied...",English,Department of Home Economics & Hospitality Man...,...,https://www.ijhhr.org/open-access-policy/,,,TX1-1110,Technology: Home economics,No,2022-11-13T20:06:16Z,2022-11-19T14:23:23Z,42,2023-01-01T16:55:03Z
19155,Papireto,https://papireto.accademiadipalermo.it/,https://doaj.org/toc/6f40c9737dfb4eab8881600cb...,2022.0,Papireto: Rivista Scientifica Online,,2974-668X,"history of art, anthropology, aesthetics, arch...","Italian, English, Spanish, French",Dipartimento di Comunicazione e Didattica dell...,...,https://papireto.accademiadipalermo.it/info/,,,NX440-632|CC1-960,Fine Arts: Arts in general: History of the art...,No,2023-03-23T17:17:20Z,2023-03-23T17:17:20Z,16,2023-03-23T21:46:16Z
19156,Advances in Civil and Architectural Engineering,https://hrcak.srce.hr/ojs/index.php/acae/index,https://doaj.org/toc/e366d1aee172433188fda8938...,2022.0,,,2975-3848,"concrete structures, construction materials, h...",English,"Josip Juraj Strossmayer University of Osijek, ...",...,https://hrcak.srce.hr/ojs/index.php/acae/copy,1847-8948,,TA1-2040,Technology: Engineering (General). Civil engin...,No,2022-11-21T11:18:02Z,2022-11-21T11:18:02Z,6,2023-01-04T12:01:11Z


In [17]:
#create an extra column that extract the journal's domain,
# which will be used in the reverse dictionary
doaj["journal_domain"] = doaj["Journal URL"].apply(lambda x: tldextract.extract(x).domain)
doaj["journal_domain"]

0                       scielo
1                        unimi
2                      ptbioch
3        medicaljournalssweden
4                      medicos
                 ...          
19153              u-bourgogne
19154                    ijhhr
19155       accademiadipalermo
19156                     srce
19157           tsunamisociety
Name: journal_domain, Length: 19158, dtype: object

In [18]:
green_flags_dict = pd.Series(doaj["Journal URL"].values,index=doaj["Journal title"]).to_dict()
green_flags_reverse = pd.Series(doaj["Journal title"].values, index=doaj["journal_domain"]).to_dict()

In [19]:
# see the result to do a smoke test
green_flags_reverse

{'scielo': 'Revista Uruguaya de Medicina Interna',
 'unimi': 'Dissertation Nursing',
 'ptbioch': 'Acta Biochimica Polonica',
 'medicaljournalssweden': 'Journal of Rehabilitation Medicine - Clinical Communications',
 'medicos': 'Acta Médica Costarricense',
 'pbsociety': 'Acta Agrobotanica',
 'elsevier': 'Journal of Orthopaedic Reports',
 'srce': 'Advances in Civil and Architectural Engineering',
 'sagepub': 'Video Journal of Sports Medicine',
 'ufba': 'Revista Direito e Sexualidade',
 'revistaalergia': 'Revista Alergia México',
 'ashs': 'HortScience',
 'ceon': 'Bulletin of Natural Sciences Research',
 'ul': 'ESTUDIO',
 'allenpress': 'Innovations in Digital Health, Diagnostics, and Biomarkers',
 'iaepan': 'Ethnologia Polona',
 'archives-animal-breeding': 'Archives Animal Breeding',
 'csic': 'Disparidades',
 'sup': 'Archivos de Pediatría del Uruguay',
 'uco': 'Seriarte',
 'alanrevista': 'Archivos Latinoamericanos de Nutrición',
 'ugr': 'Theory Now',
 'aoa': 'Revista de la Asociación Odont

In [20]:
import json

scholarly_flags = {
    "red_scholarly": red_flags_dict,
    "red_scholarly_reverse": red_flags_reverse,
    "yellow_scholarly": yellow_flags_dict,
    "yellow_scholarly_reverse": yellow_flags_reverse,
    "green_scholarly": green_flags_dict,
    "green_scholarly_reverse": green_flags_reverse
}

with open("scholarly_flags.json", "w") as outfile:
    json.dump(scholarly_flags, outfile)
    