In [1]:
import pandas as pd
import requests
from datetime import date, timedelta 
from slugify import slugify 

In [2]:
import subprocess
def get_git_user():
    """Return the e-mail address stored in git's ``user.email`` setting.

    Runs ``git config user.email``, captures its standard output, removes
    surrounding whitespace from the raw bytes and decodes them to ``str``.
    """
    completed = subprocess.run(
        ["git", "config", "user.email"],
        stdout=subprocess.PIPE,
    )
    raw_email = completed.stdout.strip()
    return raw_email.decode()

In [3]:
# Load the JCR sociology journal table; its 'ISSN' column is what the OpenAlex download iterates over.
journals_list = pd.read_csv("../../reviewerSelection-data/JCR_JournalResults_Sociology.csv")
print(len(journals_list))
journals_list.head()

214


Unnamed: 0,Journal name,JCR Abbreviation,ISSN,eISSN,Category,Total Citations,2022 JIF,JIF Quartile,2022 JCI,% of OA Gold
0,ANNALS OF TOURISM RESEARCH,ANN TOURISM RES,0160-7383,1873-7722,SOCIOLOGY - SSCI,19874,13.2,Q1,3.52,17.21%
1,Annual Review of Sociology,ANNU REV SOCIOL,0360-0572,1545-2115,SOCIOLOGY - SSCI,15214,10.5,Q1,4.3,1.15%
2,AMERICAN SOCIOLOGICAL REVIEW,AM SOCIOL REV,0003-1224,1939-8271,SOCIOLOGY - SSCI,25619,9.1,Q1,4.98,13.91%
3,EUROPEAN SOCIETIES,EUR SOC,1461-6696,1469-8307,SOCIOLOGY - SSCI,2158,8.1,Q1,2.54,34.97%
4,SOCIOLOGICAL METHODS & RESEARCH,SOCIOL METHOD RES,0049-1241,1552-8294,SOCIOLOGY - SSCI,8840,6.3,Q1,3.18,15.87%


In [4]:
def extract_works_relevant(list_relevant, year_limit=30, timeout=60):
    '''Download the OpenAlex works of every journal listed in list_relevant.

    Parameters
    ----------
    list_relevant : mapping with an 'ISSN' entry (e.g. a pandas DataFrame)
        Its 'ISSN' values are queried one after another.
    year_limit : int, default 30
        Size of the look-back window in years. A year is counted as 365 days,
        so leap days are not accounted for.
    timeout : float, default 60
        Seconds to wait for each HTTP response before requests gives up.

    Returns
    -------
    list of list of dict
        One inner list per ISSN, in input order, holding the raw work records
        returned by the API (not a dataframe).

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    '''
    user_mail = get_git_user()
    current_date = date.today()
    date_limit = current_date - timedelta(days=year_limit * 365)

    # Joined from a list so the query parameter contains no stray newline or
    # indentation (a triple-quoted literal would carry both).
    select = ','.join([
        'id', 'authorships', 'publication_year', 'display_name',
        'primary_location', 'concepts', 'cited_by_count', 'language',
        'referenced_works', 'related_works', 'abstract_inverted_index',
    ])

    total_list = []
    for issn in list_relevant['ISSN']:
        # The filter only depends on the journal, so build it once per ISSN.
        filters = f'primary_location.source.issn:{issn},from_publication_date:{date_limit},to_publication_date:{current_date}'

        curr_res_list = []
        cursor = '*'  # '*' requests the first page of a cursor-paged query
        while cursor is not None:
            params = {
                'filter': filters,
                'per_page': 200,
                'cursor': cursor,
                'select': select,
            }
            # Leave mailto out entirely when git has no e-mail configured
            # instead of sending an empty value.
            if user_mail:
                params['mailto'] = user_mail

            response = requests.get("https://api.openalex.org/works", params=params, timeout=timeout)
            # Fail with the real HTTP error rather than a KeyError on 'results'.
            response.raise_for_status()
            res = response.json()

            curr_res_list.extend(res['results'])
            cursor = res['meta'].get('next_cursor', None)
            print(len(curr_res_list))

        total_list.append(curr_res_list)
        print(len(total_list))

    return total_list

In [5]:
# Download the works of every journal in journals_list from OpenAlex.
# The result holds one list of work records per ISSN; the call prints running counts while paging.
all_sociology_works = extract_works_relevant(journals_list) 

200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400
3600
3678
3678
1
200
400
600
792
792
2
200
400
600
800
1000
1200
1400
1600
1764
1764
3
200
400
600
800
972
972
4
200
400
600
800
978
978
5
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3369
3369
6
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2461
2461
7
200
218
218
8
200
400
600
772
772
9
200
400
600
800
1000
1042
1042
10
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400
3600
3800
4000
4200
4400
4600
4800
5000
5200
5400
5446
5446
11
200
400
600
800
1000
1200
1400
1600
1800
2000
2105
2105
12
200
400
600
694
694
13
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400
3600
3800
4000
4200
4400
4600
4800
5000
5200
5400
5600
5800
6000
6200
6400
6422
6422
14
200
400
600
687
687
15
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2599
2599
16
200
400
600
800
1000
1103
1103
17
200
233
233
18
200
400
600
800
10

In [6]:
# Number of per-journal result lists returned by the download.
len(all_sociology_works)  # 214 journals 

214

In [7]:
import pickle 

# Persist the raw per-journal download, presumably so later steps can be re-run without repeating the API calls.
with open('../raw_data/all_sociology_works_v1.pkl', 'wb') as f:
    pickle.dump(all_sociology_works, f)
        
# Reload variant, kept commented out.
# NOTE(review): this path ends in "_1.0.pkl" while the dump above writes "_v1.pkl" — confirm which file is intended.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
#with open('../raw_data/all_sociology_works_1.0.pkl', 'rb') as f:
#    all_sociology_works = pickle.load(f)

In [2]:
# Sanity check before flattening: expect one entry per journal.
# NOTE(review): this cell's execution count (In [2]) is out of sequence with its neighbours —
# presumably it was run in a fresh kernel after reloading the pickle; confirm or remove.
len(all_sociology_works)

214

In [8]:
# Flatten the per-journal lists into one flat list of work records.
# The isinstance guard makes the cell safe to re-run: items that are no longer
# lists (i.e. already-flattened work dicts) are kept as they are instead of
# being iterated, which would replace every record by its key strings.
all_sociology_works = [
    entry
    for item in all_sociology_works
    for entry in (item if isinstance(item, list) else [item])
]

In [9]:
# Show the fields of the first work record, then the total number of works in the flat list.
print(all_sociology_works[0].keys())
len(all_sociology_works)

dict_keys(['id', 'authorships', 'publication_year', 'display_name', 'primary_location', 'concepts', 'cited_by_count', 'language', 'referenced_works', 'related_works', 'abstract_inverted_index'])


269004

In [72]:
def create_abstract_from_inverted(inverted_abstract):
    '''Rebuild the abstract text from an inverted index (word -> list of positions).

    Returns a one-element list holding the words joined by single spaces in
    ascending position order, or ["None"] when the index is empty or missing.
    '''
    if not inverted_abstract:
        return ["None"]

    # One (position, word) pair per occurrence of each word.
    placed_words = [
        (position, word)
        for word, positions in inverted_abstract.items()
        for position in positions
    ]
    # Stable sort on the position only, so ties keep their insertion order.
    placed_words.sort(key=lambda pair: pair[0])

    return [' '.join(word for _, word in placed_words)]

["Image has been shown to be an important influence in the selection of vacation destinations. A model that represents the important determinants of destination image formation was developed based on previous studies in a number of fields. The research reported in this article presents the results of an empirical test of the model using path analysis. A major finding of the study was that a destination image is formed by both stimulus factors and tourists' characteristics. The results of this investigation provide important implications for strategic image management and can aid in designing and implementing marketing programs for creating and enhancing tourism destination images. Un modèle pour la formulation de l'image de marque des destinations. On a démontré que l'image de marque est une influence importante dans le choix de destination de vacances. En se basant sur des études précédentes dans plusieurs domaines, on a développé un modèle qui représente les déterminants importants d

In [94]:
def create_df(total_list):
    '''Create a dataframe with one row per work record in total_list.

    Parameters
    ----------
    total_list : iterable of dict
        Work records carrying the keys read below ('id', 'publication_year',
        'language', 'primary_location', 'authorships', 'concepts',
        'cited_by_count', 'display_name', 'referenced_works',
        'related_works', 'abstract_inverted_index').

    Returns
    -------
    pandas.DataFrame
        Columns, in order: id, publication_year, language, journal_issnl,
        journal_name, authors, author_institutions, title, concepts_name,
        concepts_level, concepts_score, cited_by_count, referenced_works,
        related_works, abstracts.

    Notes
    -----
    * journal_name / journal_issnl are None when the record has no primary
      location or the location has no source.
    * author_institutions only receives an entry for authors that have at
      least one institution, so it can be shorter than authors.
    * abstracts holds the value returned by create_abstract_from_inverted.
    '''
    column_names = [
        'id', 'publication_year', 'language', 'journal_issnl', 'journal_name',
        'authors', 'author_institutions', 'title', 'concepts_name',
        'concepts_level', 'concepts_score', 'cited_by_count',
        'referenced_works', 'related_works', 'abstracts',
    ]
    # One accumulator list per output column; dict order fixes the column order.
    columns = {name: [] for name in column_names}

    for work in total_list:
        # Both the location and its source may be null; fall back to empty
        # mappings so the journal fields become None instead of raising.
        location = work['primary_location'] or {}
        source = location.get('source') or {}

        authorships = work['authorships']
        authors = [entry['author'].get('display_name', '') for entry in authorships]
        # First listed institution of every author that has one.
        author_institutions = [
            entry['institutions'][0].get('display_name', '')
            for entry in authorships
            if entry['institutions']
        ]

        concepts = work['concepts'] or []

        columns['id'].append(work['id'])
        columns['publication_year'].append(work['publication_year'])
        columns['language'].append(work['language'])
        columns['journal_issnl'].append(source.get('issn_l'))
        columns['journal_name'].append(source.get('display_name'))
        columns['authors'].append(authors)
        columns['author_institutions'].append(author_institutions)
        columns['title'].append(work['display_name'])
        columns['concepts_name'].append([concept['display_name'] for concept in concepts])
        columns['concepts_level'].append([concept['level'] for concept in concepts])
        columns['concepts_score'].append([concept['score'] for concept in concepts])
        columns['cited_by_count'].append(work['cited_by_count'])
        columns['referenced_works'].append(work['referenced_works'])
        columns['related_works'].append(work['related_works'])
        columns['abstracts'].append(create_abstract_from_inverted(work['abstract_inverted_index']))

    return pd.DataFrame(columns)

In [95]:
# Convert the flat list of work records into a dataframe and preview its first three rows.
all_sociology_works_df = create_df(all_sociology_works)
all_sociology_works_df[:3]

Unnamed: 0,id,publication_year,language,journal_issnl,journal_name,authors,author_institutions,title,concepts_name,concepts_level,concepts_score,cited_by_count,referenced_works,related_works,abstracts
0,https://openalex.org/W2060041453,1999,fr,0160-7383,Annals of Tourism Research,"[Seyhmus Baloglu, Ken W. McCleary]","[University of Nevada, Las Vegas, Virginia Tech]",A model of destination image formation,"[Destination image, Tourism, Humanities, Welfa...","[4, 2, 1, 1, 3, 0, 0, 0, 1]","[0.72404826, 0.69142616, 0.44406348, 0.4297011...",2420,"[https://openalex.org/W1551659116, https://ope...","[https://openalex.org/W642833299, https://open...",[Image has been shown to be an important influ...
1,https://openalex.org/W2137040040,1999,en,0160-7383,Annals of Tourism Research,[Ning Wang],[Zhongshan Hospital],Rethinking authenticity in tourism experience,"[Tourism, Sociology, Aesthetics, Geography, Ar...","[2, 0, 1, 0, 0, 1]","[0.78862435, 0.45051047, 0.41071427, 0.2684945...",2152,"[https://openalex.org/W12139949, https://opena...","[https://openalex.org/W232595385, https://open...",[Abstract This paper aims at a conceptual clar...
2,https://openalex.org/W2153383537,2000,en,0160-7383,Annals of Tourism Research,"[Dwayne Baker, John L. Crompton]",[],"Quality, satisfaction and behavioral intentions","[Psychology, Quality (philosophy), Consumer sa...","[0, 2, 2, 0, 1, 1, 0]","[0.591048, 0.55278987, 0.47371072, 0.39759916,...",1981,"[https://openalex.org/W1495489389, https://ope...","[https://openalex.org/W242256928, https://open...",[Performance quality was conceptualized as the...


In [97]:
# (rows, columns) of the works dataframe.
all_sociology_works_df.shape

(269004, 15)

In [99]:
# Temporary column: each work's referenced and related work URLs concatenated into one list.
all_sociology_works_df["merged_refs"] = (
    all_sociology_works_df["referenced_works"] + all_sociology_works_df["related_works"]
)

# Keep only the last path segment of every URL (the text after the final "/").
all_sociology_works_df["works_referenced_related"] = [
    [work_url.rsplit("/", 1)[-1] for work_url in merged]
    for merged in all_sociology_works_df["merged_refs"]
]

all_sociology_works_df.shape

(269004, 17)

In [102]:
# Remove the temporary helper column. errors="ignore" keeps the cell re-runnable:
# a second run finds no "merged_refs" column and simply leaves the frame unchanged
# instead of raising KeyError.
all_sociology_works_df = all_sociology_works_df.drop(columns="merged_refs", errors="ignore")
all_sociology_works_df.head(2)

Unnamed: 0,id,publication_year,language,journal_issnl,journal_name,authors,author_institutions,title,concepts_name,concepts_level,concepts_score,cited_by_count,referenced_works,related_works,abstracts,works_referenced_related
0,https://openalex.org/W2060041453,1999,fr,0160-7383,Annals of Tourism Research,"[Seyhmus Baloglu, Ken W. McCleary]","[University of Nevada, Las Vegas, Virginia Tech]",A model of destination image formation,"[Destination image, Tourism, Humanities, Welfa...","[4, 2, 1, 1, 3, 0, 0, 0, 1]","[0.72404826, 0.69142616, 0.44406348, 0.4297011...",2420,"[https://openalex.org/W1551659116, https://ope...","[https://openalex.org/W642833299, https://open...",[Image has been shown to be an important influ...,"[W1551659116, W1554035063, W1930240810, W19663..."
1,https://openalex.org/W2137040040,1999,en,0160-7383,Annals of Tourism Research,[Ning Wang],[Zhongshan Hospital],Rethinking authenticity in tourism experience,"[Tourism, Sociology, Aesthetics, Geography, Ar...","[2, 0, 1, 0, 0, 1]","[0.78862435, 0.45051047, 0.41071427, 0.2684945...",2152,"[https://openalex.org/W12139949, https://opena...","[https://openalex.org/W232595385, https://open...",[Abstract This paper aims at a conceptual clar...,"[W12139949, W169769535, W236608154, W283541863..."


In [103]:
def concat_author_title(df):
    '''Add first-author last name, slugified title and their combination to df.

    Three columns are written to df itself (the frame is modified in place)
    and the same frame is returned:

    * authors_first_lastname: last whitespace-separated token of the first
      entry in 'authors', lower-cased with dots removed; '' when there are no
      authors or the first author is None, empty or contains no name token.
    * title_slugified: slugify(title) for string titles, '' otherwise.
    * concat_name_title: the two values above joined by '-'.
    '''
    def _first_author_lastname(authors):
        # No authors at all, or no usable first author.
        if len(authors) == 0 or authors[0] is None:
            return ''
        name_parts = authors[0].lower().replace('.', '').split()
        # A blank or dots-only name yields no parts; indexing [-1] would raise.
        return name_parts[-1] if name_parts else ''

    df['authors_first_lastname'] = df['authors'].apply(_first_author_lastname)

    df['title_slugified'] = df['title'].apply(
        lambda title: slugify(title) if isinstance(title, str) else ''
    )
    df['concat_name_title'] = df['authors_first_lastname'] + '-' + df['title_slugified']

    return df

In [104]:
# Add the first-author / title key columns. concat_author_title writes the columns onto the frame it
# is given and returns that same frame, so end_df and all_sociology_works_df refer to one object.
end_df = concat_author_title(all_sociology_works_df)

In [107]:
# Reduce each value in "id" to the text after its final "/".
end_df["id"] = end_df["id"].map(lambda work_url: work_url.rsplit("/", 1)[-1])

In [108]:
# Preview the first two rows of the final table.
end_df.head(2)

Unnamed: 0,id,publication_year,language,journal_issnl,journal_name,authors,author_institutions,title,concepts_name,concepts_level,concepts_score,cited_by_count,referenced_works,related_works,abstracts,works_referenced_related,authors_first_lastname,title_slugified,concat_name_title
0,W2060041453,1999,fr,0160-7383,Annals of Tourism Research,"[Seyhmus Baloglu, Ken W. McCleary]","[University of Nevada, Las Vegas, Virginia Tech]",A model of destination image formation,"[Destination image, Tourism, Humanities, Welfa...","[4, 2, 1, 1, 3, 0, 0, 0, 1]","[0.72404826, 0.69142616, 0.44406348, 0.4297011...",2420,"[https://openalex.org/W1551659116, https://ope...","[https://openalex.org/W642833299, https://open...",[Image has been shown to be an important influ...,"[W1551659116, W1554035063, W1930240810, W19663...",baloglu,a-model-of-destination-image-formation,baloglu-a-model-of-destination-image-formation
1,W2137040040,1999,en,0160-7383,Annals of Tourism Research,[Ning Wang],[Zhongshan Hospital],Rethinking authenticity in tourism experience,"[Tourism, Sociology, Aesthetics, Geography, Ar...","[2, 0, 1, 0, 0, 1]","[0.78862435, 0.45051047, 0.41071427, 0.2684945...",2152,"[https://openalex.org/W12139949, https://opena...","[https://openalex.org/W232595385, https://open...",[Abstract This paper aims at a conceptual clar...,"[W12139949, W169769535, W236608154, W283541863...",wang,rethinking-authenticity-in-tourism-experience,wang-rethinking-authenticity-in-tourism-experi...


In [109]:
# Write the final table to disk, leaving out the dataframe index.
end_df.to_csv("../raw_data/all_sociology_works_v1.csv", index=False)