In [8]:
import pandas as pd
import requests
from datetime import date, timedelta 

In [2]:
import subprocess
def get_git_user():
    """Return the e-mail address reported by ``git config user.email``.

    The command's stdout is captured, stripped of surrounding whitespace
    and decoded to ``str``. The value is used as the ``mailto`` parameter
    of the OpenAlex requests made by ``extract_works_relevant``.
    """
    completed = subprocess.run(
        ["git", "config", "user.email"],
        stdout=subprocess.PIPE,
    )
    raw_email = completed.stdout.strip()
    return raw_email.decode()

In [3]:
# Load the JCR sociology journal list; its ISSN column drives the OpenAlex download below.
journals_list = pd.read_csv("../../reviewerSelection-data/JCR_JournalResults_Sociology.csv")
# Number of journals in the list.
print(len(journals_list))
journals_list.head()

214


Unnamed: 0,Journal name,JCR Abbreviation,ISSN,eISSN,Category,Total Citations,2022 JIF,JIF Quartile,2022 JCI,% of OA Gold
0,ANNALS OF TOURISM RESEARCH,ANN TOURISM RES,0160-7383,1873-7722,SOCIOLOGY - SSCI,19874,13.2,Q1,3.52,17.21%
1,Annual Review of Sociology,ANNU REV SOCIOL,0360-0572,1545-2115,SOCIOLOGY - SSCI,15214,10.5,Q1,4.3,1.15%
2,AMERICAN SOCIOLOGICAL REVIEW,AM SOCIOL REV,0003-1224,1939-8271,SOCIOLOGY - SSCI,25619,9.1,Q1,4.98,13.91%
3,EUROPEAN SOCIETIES,EUR SOC,1461-6696,1469-8307,SOCIOLOGY - SSCI,2158,8.1,Q1,2.54,34.97%
4,SOCIOLOGICAL METHODS & RESEARCH,SOCIOL METHOD RES,0049-1241,1552-8294,SOCIOLOGY - SSCI,8840,6.3,Q1,3.18,15.87%


In [4]:
def extract_works_relevant(list_relevant, year_limit=30, timeout=60):
    '''Download all OpenAlex works published in the journals of list_relevant.

    Parameters
    ----------
    list_relevant : mapping or DataFrame
        Must provide an 'ISSN' entry that iterates over journal ISSNs.
    year_limit : int, default 30
        Only works published within the last ``year_limit`` years are
        requested (a year is approximated as 365 days).
    timeout : float, default 60
        Timeout in seconds passed to every HTTP request, so a stalled
        connection cannot hang the notebook forever.

    Returns
    -------
    list of list of dict
        One inner list per ISSN (in input order), each holding the raw
        OpenAlex work records for that journal.

    Raises
    ------
    requests.HTTPError
        If OpenAlex answers with an error status (e.g. rate limiting).
    '''
    user_mail = get_git_user()
    current_date = date.today()
    date_limit = current_date - timedelta(days=year_limit * 365)

    # Build the field list programmatically: a triple-quoted string would
    # smuggle a newline and indentation into the query parameter.
    select = ",".join([
        "id", "authorships", "publication_year", "display_name",
        "primary_location", "concepts", "cited_by_count", "language",
        "referenced_works", "related_works", "abstract_inverted_index",
    ])

    total_list = []
    for issn in list_relevant['ISSN']:
        curr_res_list = []
        # '*' starts cursor pagination; the loop below stops once the
        # response no longer carries a next_cursor.
        cursor = '*'
        # The filter only depends on the journal, not on the page.
        filters = f'primary_location.source.issn:{issn},from_publication_date:{date_limit},to_publication_date:{current_date}'

        while cursor is not None:
            params = {
                'filter': filters,
                'per_page': 200,
                'cursor': cursor,
                'select': select,
                'mailto': user_mail
            }
            response = requests.get("https://api.openalex.org/works",
                                    params=params, timeout=timeout)
            # Fail loudly on HTTP errors instead of hitting a KeyError on
            # an error payload that has no 'results'.
            response.raise_for_status()
            res = response.json()
            curr_res_list.extend(res['results'])
            cursor = res['meta'].get('next_cursor')
            # Progress: works collected so far for this journal.
            print(len(curr_res_list))

        total_list.append(curr_res_list)
        # Progress: journals finished so far.
        print(len(total_list))

    return total_list

In [5]:
# Long-running download: one paginated OpenAlex query per journal in journals_list
# (the printed numbers below are the per-journal and overall progress counters).
all_sociology_works = extract_works_relevant(journals_list)  

200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400
3600
3678
3678
1
200
400
600
792
792
2
200
400
600
800
1000
1200
1400
1600
1764
1764
3
200
400
600
800
972
972
4
200
400
600
800
978
978
5
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3369
3369
6
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2461
2461
7
200
218
218
8
200
400
600
772
772
9
200
400
600
800
1000
1042
1042
10
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400
3600
3800
4000
4200
4400
4600
4800
5000
5200
5400
5446
5446
11
200
400
600
800
1000
1200
1400
1600
1800
2000
2105
2105
12
200
400
600
694
694
13
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400
3600
3800
4000
4200
4400
4600
4800
5000
5200
5400
5600
5800
6000
6200
6400
6422
6422
14
200
400
600
687
687
15
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2599
2599
16
200
400
600
800
1000
1103
1103
17
200
233
233
18
200
400
600
800
10

In [6]:
len(all_sociology_works)  # 214 journals 

214

In [1]:
import pickle 

# One-off caching of the download; kept commented out so a re-run does not overwrite the file.
# NOTE(review): this dump path lacks the '../raw_data/' prefix used for loading below - confirm the file was moved.
#with open('all_sociology_works_morecols.pkl', 'wb') as f:
#    pickle.dump(all_sociology_works, f)
        
# Reload the cached works instead of querying OpenAlex again.
# pickle.load can execute arbitrary code, so only load pickle files you created yourself.
with open('../raw_data/all_sociology_works_morecols.pkl', 'rb') as f:
    all_sociology_works = pickle.load(f)

In [2]:
len(all_sociology_works)

214

In [3]:
# Flatten the per-journal lists into one flat list of work dicts.
# The isinstance guard makes the cell idempotent: without it, running the cell a
# second time would iterate over each work dict and replace the data with its keys.
all_sociology_works = [
    work
    for journal_works in all_sociology_works
    for work in (journal_works if isinstance(journal_works, list) else [journal_works])
]

In [4]:
print(all_sociology_works[0].keys())
len(all_sociology_works)

dict_keys(['id', 'authorships', 'publication_year', 'display_name', 'primary_location', 'concepts', 'cited_by_count', 'language', 'referenced_works', 'related_works', 'abstract_inverted_index'])


268998

In [5]:
def create_abstract_from_inverted(inverted_abstract):
    """Rebuild plain-text abstracts from inverted indexes.

    ``inverted_abstract`` is an iterable of work dicts; each must have an
    "abstract_inverted_index" key mapping a word to the list of positions
    at which it occurs. Returns one string per work, in input order: the
    words joined by single spaces in position order, or the literal string
    "None" when the index is missing or empty.
    """

    def _rebuild(inverted_index):
        # Expand {word: [positions]} into (position, word) pairs.
        placed = [
            (position, token)
            for token, positions in inverted_index.items()
            for position in positions
        ]
        # Sort on position only; the stable sort keeps insertion order for ties.
        placed.sort(key=lambda pair: pair[0])
        return ' '.join(token for _, token in placed)

    return [
        _rebuild(work["abstract_inverted_index"])
        if work["abstract_inverted_index"]
        else "None"
        for work in inverted_abstract
    ]

#abstracts = create_abstract_from_inverted(all_sociology_works)
#abstracts[:10]

In [6]:
def create_ref_csv(all_sociology_works):
    """
    Turn a flat list of OpenAlex work dicts into a DataFrame, one row per work.

    Columns: oa_id, year, language, journal_issn, journal_name, authors,
    author_institutions, concepts, referenced_works, related_works, abstracts.

    Every row is assembled from a single work, so the columns cannot drift
    out of alignment: a work without a primary location, source, ISSN or
    journal name gets None in the journal column(s) instead of being skipped
    (skipping would shift the journal data onto other works and let zip()
    silently truncate the frame).

    - oa_id is the last path segment of the work's "id" URL.
    - authors holds the display names of authorships that have an author.
    - author_institutions holds every institution name of those authorships,
      with "NA" for an authorship that lists no institution.
    - concepts keeps only display_name, level and score of each concept.
    - abstracts comes from create_abstract_from_inverted.
    """
    columns = ["oa_id", "year", "language", "journal_issn", "journal_name",
               "authors", "author_institutions", "concepts",
               "referenced_works", "related_works", "abstracts"]

    abstracts = create_abstract_from_inverted(all_sociology_works)

    rows = []
    for work, abstract in zip(all_sociology_works, abstracts):
        # primary_location and its source may each be missing or None.
        source = (work["primary_location"] or {}).get("source") or {}
        journal_issn = source.get("issn_l") or None
        journal_name = source.get("display_name") or None

        authors_per_paper = []
        author_institution_per_paper = []
        for authorship in work["authorships"]:
            if not authorship["author"]:
                continue
            if authorship["author"]["display_name"]:
                authors_per_paper.append(authorship["author"]["display_name"])
            if authorship["institutions"]:
                author_institution_per_paper.extend(
                    institution["display_name"]
                    for institution in authorship["institutions"]
                )
            else:
                author_institution_per_paper.append("NA")

        concepts_per_paper = [
            {"display_name": info["display_name"],
             "level": info["level"],
             "score": info["score"]}
            for info in work["concepts"]
        ]

        rows.append((
            work["id"].split("/")[-1],
            work["publication_year"],
            work["language"],
            journal_issn,
            journal_name,
            authors_per_paper,
            author_institution_per_paper,
            concepts_per_paper,
            work["referenced_works"],
            work["related_works"],
            abstract,
        ))

    return pd.DataFrame(rows, columns=columns)

In [9]:
all_sociology_works_df = create_ref_csv(all_sociology_works)
all_sociology_works_df[:3]

Unnamed: 0,oa_id,year,language,journal_issn,journal_name,authors,author_institutions,concepts,referenced_works,related_works,abstracts
0,W2060041453,1999,fr,0160-7383,Annals of Tourism Research,"[Seyhmus Baloglu, Ken W. McCleary]","[University of Nevada, Las Vegas, Virginia Tech]","[{'display_name': 'Destination image', 'level'...","[https://openalex.org/W1551659116, https://ope...","[https://openalex.org/W642833299, https://open...",Image has been shown to be an important influe...
1,W2137040040,1999,en,0160-7383,Annals of Tourism Research,[Ning Wang],[Zhongshan Hospital],"[{'display_name': 'Tourism', 'level': 2, 'scor...","[https://openalex.org/W12139949, https://opena...","[https://openalex.org/W232595385, https://open...",Abstract This paper aims at a conceptual clari...
2,W2153383537,2000,en,0160-7383,Annals of Tourism Research,"[Dwayne Baker, John L. Crompton]","[NA, NA]","[{'display_name': 'Psychology', 'level': 0, 's...","[https://openalex.org/W1495489389, https://ope...","[https://openalex.org/W242256928, https://open...",Performance quality was conceptualized as the ...


In [10]:
all_sociology_works_df.shape

(268998, 11)

In [11]:
# Concatenate the two URL lists of every work into one temporary helper column.
all_sociology_works_df["merged_refs"] = all_sociology_works_df["referenced_works"] + all_sociology_works_df["related_works"]

# Keep only the last path segment of each URL.
refs_all = [
    [url.split("/")[-1] for url in merged]
    for merged in all_sociology_works_df["merged_refs"]
]
all_sociology_works_df["works_referenced_related"] = refs_all 

all_sociology_works_df.shape 

(268998, 13)

In [14]:
# Remove the temporary helper column. errors="ignore" keeps the cell re-runnable:
# without it a second execution raises KeyError because the column is already gone.
all_sociology_works_df.drop(columns = ["merged_refs"], inplace = True, errors = "ignore")
all_sociology_works_df.shape

(268998, 12)

In [17]:
all_sociology_works_df["works_referenced_related"][0][1]

'W1554035063'

In [22]:
print(type(all_sociology_works_df["works_referenced_related"][0]))

<class 'list'>


In [23]:
from ast import literal_eval

In [21]:
## Sanity check: works_referenced_related should hold Python lists, not their string representation.
## The string-to-list conversion below is disabled; the printed type is already <class 'list'>.
## NOTE(review): presumably the conversion is only needed after reading the column back from CSV - confirm.

print(type(all_sociology_works_df["works_referenced_related"][0]))
#all_sociology_works_df["works_referenced_related"] = all_sociology_works_df["works_referenced_related"].apply(lambda x: x.replace('[', '').replace(']', '').replace(',', '').replace("'", "").split(' ')) 
#print(type(all_sociology_works_df["works_referenced_related"][0]))
all_sociology_works_df.shape

<class 'list'>


(268998, 12)

In [19]:
# Persist the table without the index column.
# NOTE(review): list/dict-valued columns (authors, concepts, works_referenced_related, ...) are written
# as their string representation in CSV and must be parsed again after reading the file back.
all_sociology_works_df.to_csv(path_or_buf="../raw_data/all_sociology_works_morecols_new.csv", index = False) 