In [1]:
# To zhuzhu

import re
import requests
import pandas as pd
import numpy as np
from time import sleep
import bs4
from bs4 import BeautifulSoup
import pickle # for loading a dictionary from disk
from typing import Optional # typehint that value can also be None
from dataprep.clean import validate_country
from dataprep.clean import clean_country



Note: NumExpr detected 10 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
NumExpr defaulting to 8 threads.


In [94]:
#!pip install dataprep

In [110]:
import spacy
from spacy import displacy
import warnings
warnings.filterwarnings("ignore")

# Larger and slower pipeline but more accurate
# spacy.cli.download("en_core_web_trf")
# NER= spacy.load("en_core_web_trf")
                   
# Faster and smaller pipelien, less accurate                   
spacy.cli.download("en_core_web_sm")
NER= spacy.load("en_core_web_sm")

def extract_country_from_abstract(abs_string):
    text_ner = NER(abs_string)
    #displacy.render(text_ner,style="ent",jupyter=True)

    # Extract the text from each tag
    tag_list = np.array([X.text for X in text_ner.ents])
 #   tag_list_df = pd.DataFrame({ "country": tag_list })
  #  tag_list_clean = clean_country(tag_list_df, "country").country_clean.values
    GPE_mask = np.array([X.label_ =='GPE'  for X in text_ner.ents])
    LOC_mask= np.array([X.label_ =='LOC'  for X in text_ner.ents])
    mask = [validate_country(tag) for tag in tag_list]

    # Abstract_valid_country
    abs_country = tag_list[mask]
    #abs_GPE 
    GPE_list,LOC_list = [],[]
    if len(GPE_mask)>0:
      GPE_list = tag_list[GPE_mask]
    if len(LOC_mask)>0:
      LOC_list = tag_list[LOC_mask]
    return abs_country, GPE_list, LOC_list


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [111]:

def read_journal(url_list):
    res = pd.DataFrame(columns= ['DOI','Journal','Type','Title','Publish_Time','Issue_Time','Co_Author','Author','Rank','Nationality','Abs_country','Abs_GPE','Abs_LOC'])
    i=0
    for url in url_list:
        home_page = requests.get(url)
        raw_html = home_page.text
        soup = BeautifulSoup(raw_html, 'html.parser')
        
        journal = soup.find('div',{'class':"row crumbs-row"}).find_all('li',{'class':"page-breadcrumbs__item"})[2].find_all('span')[-1].get_text()
        # create a new row for each author
        main_content = soup.find('div',{'id':"maincontent"})
        title = main_content.find('h1').get_text()
        contributors_details = main_content.find_all('div',{'class':'row author'})
        published_date = main_content.find('div',{'class':"row published-date"}).find('strong').get_text()
        # Extract country information from abstract
        abs_tag = soup.find('div',{'class':'abstract-text-container'})
        if type(abs_tag)== bs4.element.Tag:
          abs_text = abs_tag.find('div',{'lang':'en'}).find('p').get_text()
          abs_country, GPE_list, LOC_list = extract_country_from_abstract(abs_text)
        else:
          abs_country, GPE_list, LOC_list = [],[],[]

        
        article_details = soup.find('dl',{'class':"article-details"})
        Type = article_details.find('div',{'class':'row'}).find('dd').get_text()
        issue_time = article_details.find('div',{'class':'content__journal'}).find_all('span')[-2].get_text().strip(',')

        doi = article_details.find('div',{'class':"doi-data"}).find('span').get_text()

        for author_describe_idx in range(len(contributors_details)):
            author_describe = contributors_details[author_describe_idx]
            temp_dict={}
            temp_dict['Co_Author'] = len(contributors_details) > 1
            temp_dict['Author']=author_describe.get('data-test-author')
            temp_dict['Rank'] = 'First' if (author_describe_idx == 0) else 'Communication' \
                                if (author_describe_idx == len(contributors_details)-1) else 'Other'
            temp_dict['Nationality'] = author_describe.find_all('span')[-1].get_text().split(',')[-1].strip()
            temp_dict['Publish_Time']= published_date
            temp_dict['Issue_Time']= issue_time
            temp_dict['Title']= title
            temp_dict['Journal']= journal
            temp_dict['DOI']= doi
            temp_dict['Type']= Type
            temp_dict['Abs_country'] = abs_country
            temp_dict['Abs_GPE'] = GPE_list
            temp_dict['Abs_LOC'] = LOC_list
            res = res.append(temp_dict, ignore_index=True)
        i+=1
        print(f'Processing the {i}th paper with {len(contributors_details)} authors')
    return res

In [105]:
url = 'https://www.cambridge.org/core/journals/american-antiquity/all-issues'
base_url = 'https://www.cambridge.org'

In [106]:
def Antiquity_issues_url(url,base_url): 
    issues_list= [] 
    home_page = requests.get(url)
    raw_html = home_page.text
    soup = BeautifulSoup(raw_html, 'html.parser')
    all_issues = soup.find('div',{'class':'reading-width journal-all-issues'})
    Time_range = ['decade2020','decade2010','decade2000']
    for dec in Time_range:
        panels = all_issues.find('div',{'id':dec}).find('ul',{'class':'accordion level second'}).find_all('li',recursive=False)
        for volume in panels:
            issues = volume.find('div',recursive=False).find_all('a')
            for issue in issues:
                issue_url = base_url + issue.get('href')
                issues_list.append(issue_url)
    return issues_list


In [107]:
issues_url = Antiquity_issues_url(url,base_url)

In [108]:
def Antiquity_papers_url(issues_url):
    url_list=[]
    i=0
    for url in issues_url:
        home_page = requests.get(url)
        raw_html = home_page.text
        soup = BeautifulSoup(raw_html, 'html.parser').find('div',{'class':'panel large-9 small-12 column main-column','id':'maincontent'})
        soup = soup.find('div',{'class':'results'}).find('div',{'class':'large-12 columns margin-top'})
        
        target = soup.find('h4',text ='Reports' )
        if target != None:
            for e in target.find_all_next():
                e.clear()
                
        target = soup.find('h4',text ='Report' )
        if target != None:
            for e in target.find_all_next():
                e.clear()
        '''
        
        
        elif soup.find('h4',text ='Front Cover (OFC, IFC) and matter' ) != None:
            target = soup.find('h4',text ='Front Cover (OFC, IFC) and matter' )
            for e in target.find_all_next():
                e.clear()
                
        elif soup.find('h4',text ='Back Cover (IBC, OBC) and matter' ) != None:
            target = soup.find('h4',text ='Back Cover (IBC, OBC) and matter' )
            for e in target.find_all_next():
                e.clear() 
        '''
                
        res = soup.find_all('div',{'class':'representation overview search'})
        for paper in res:
            try:
                add_url = paper.find('li',{'class':'title'}).find('a').get('href')
                url_list.append(base_url+add_url)
                i+=1
                print(f'Appeding the {i}th url')
            except:
                pass
    return url_list
        

In [101]:
url_list = Antiquity_papers_url(issues_url)

Appeding the 1th url
Appeding the 2th url
Appeding the 3th url
Appeding the 4th url
Appeding the 5th url
Appeding the 6th url
Appeding the 7th url
Appeding the 8th url
Appeding the 9th url
Appeding the 10th url
Appeding the 11th url
Appeding the 12th url
Appeding the 13th url
Appeding the 14th url
Appeding the 15th url
Appeding the 16th url
Appeding the 17th url
Appeding the 18th url
Appeding the 19th url
Appeding the 20th url
Appeding the 21th url
Appeding the 22th url
Appeding the 23th url
Appeding the 24th url
Appeding the 25th url
Appeding the 26th url
Appeding the 27th url
Appeding the 28th url
Appeding the 29th url
Appeding the 30th url
Appeding the 31th url
Appeding the 32th url
Appeding the 33th url
Appeding the 34th url
Appeding the 35th url
Appeding the 36th url
Appeding the 37th url
Appeding the 38th url
Appeding the 39th url
Appeding the 40th url
Appeding the 41th url
Appeding the 42th url
Appeding the 43th url
Appeding the 44th url
Appeding the 45th url
Appeding the 46th u

In [112]:
df=read_journal(url_list)

Processing the 1th paper with 0 authors
Processing the 2th paper with 1 authors
Processing the 3th paper with 3 authors
Processing the 4th paper with 4 authors
Processing the 5th paper with 2 authors
Processing the 6th paper with 2 authors
Processing the 7th paper with 6 authors
Processing the 8th paper with 5 authors
Processing the 9th paper with 2 authors
Processing the 10th paper with 1 authors
Processing the 11th paper with 1 authors
Processing the 12th paper with 4 authors
Processing the 13th paper with 5 authors
Processing the 14th paper with 2 authors
Processing the 15th paper with 5 authors
Processing the 16th paper with 3 authors
Processing the 17th paper with 0 authors
Processing the 18th paper with 3 authors
Processing the 19th paper with 3 authors
Processing the 20th paper with 5 authors
Processing the 21th paper with 3 authors
Processing the 22th paper with 2 authors
Processing the 23th paper with 2 authors
Processing the 24th paper with 10 authors
Processing the 25th pape

In [113]:
df.to_csv('American_Antiquity.csv')

In [114]:
df

Unnamed: 0,DOI,Journal,Type,Title,Publish_Time,Issue_Time,Co_Author,Author,Rank,Nationality,Abs_country,Abs_GPE,Abs_LOC
0,https://doi.org/10.1017/aaq.2021.120,American Antiquity,Forum,On Rehumanizing Pleistocene People of the West...,01 December 2021,April 2022,False,Bonnie L. Pitblado,First,USA (bonnie.pitblado@ou.edu),[],"[Folsom, New Mexico]",[the Western Hemisphere]
1,https://doi.org/10.1017/aaq.2021.115,American Antiquity,Article,Direct Evidence for Geophyte Exploitation in t...,02 November 2021,April 2022,True,Kaley Joyce,First,USA,[10],[Wyoming],"[the Wyoming Basin, the Wyoming Basin]"
2,https://doi.org/10.1017/aaq.2021.115,American Antiquity,Article,Direct Evidence for Geophyte Exploitation in t...,02 November 2021,April 2022,True,Lisbeth A. Louderback,Other,USA,[10],[Wyoming],"[the Wyoming Basin, the Wyoming Basin]"
3,https://doi.org/10.1017/aaq.2021.115,American Antiquity,Article,Direct Evidence for Geophyte Exploitation in t...,02 November 2021,April 2022,True,Erick Robinson,Communication,USA,[10],[Wyoming],"[the Wyoming Basin, the Wyoming Basin]"
4,https://doi.org/10.1017/aaq.2021.119,American Antiquity,Article,"Regional Conflict, Ceramic Senescence, and Paw...",02 December 2021,April 2022,True,Margaret E. Beck,First,USA,[],[Kitkahahki Town],[the Central Great Plains]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1350,https://doi.org/10.2307/2694057,American Antiquity,Articles,"Questions of Evidence, Legitimacy, and the (Di...",20 January 2017,April 2000,False,Alison Wylie,First,MO 63130,[],[],[]
1351,https://doi.org/10.2307/2694058,American Antiquity,Articles,Archaeology and Native North American Oral Tra...,20 January 2017,April 2000,False,Ronald J. Mason,First,WI 54912-0599,[],[],[]
1352,https://doi.org/10.2307/2694059,American Antiquity,Articles,Ancient History in the New World: Integrating ...,20 January 2017,April 2000,False,Roger C. Echo-Hawk,First,Co 80204-2788,[],[North Dakota],[North America]
1353,https://doi.org/10.2307/2694805,American Antiquity,Forum,Gazing upon the Invisible: Women and Children ...,20 January 2017,January 2000,False,Connie H. Nobles,First,LA 70402,[],[Gordon 1980],[]
