In [5]:
from src.searcher import Searcher
import re
import pickle
import pandas as pd

# Keep rendered DataFrames compact: cap the rows/columns shown and widen the
# text representation so rows are not wrapped.
pd.options.display.max_rows = 20
pd.options.display.max_columns = 10
pd.options.display.width = 1000

In [6]:
# Authenticate: insert your API key in src/config.json before running this cell.
CONFIG_PATH = "src/config.json"
sr = Searcher(CONFIG_PATH)

In [7]:
# Keyword groups, one per topic.
# Each group is wrapped in the "TITLE-ABS-KEY" constructor so that it is matched
# against the title, abstract and keywords of a record.
# The literals are split over several lines for readability only; adjacent
# string literals are concatenated, so the resulting query text is unchanged.

sustainability_q = (
    ' TITLE-ABS-KEY('
    '"lca" OR "life cycle" OR "life-cycle" OR "sustainab*"'
    ' OR "life cycle assessment" OR "environment*"'
    ') '
)
resilience_q = (
    ' TITLE-ABS-KEY('
    '"resilience" OR "disrupti*" OR "vulnerab*"'
    ' OR "critical*" OR "robust*"'
    ') '
)
complexity_q = (
    ' TITLE-ABS-KEY('
    '"complexity" OR "agent-based" OR "agent based" OR "topolo*"'
    ' OR "complex adaptive system" OR "networks" OR "network"'
    ' OR "network analysis" '
    ') '
)
# NOTE(review): "supply networks" appears twice; kept verbatim so the query text
# matches the one the recorded result counts were obtained with.
supply_q = (
    ' TITLE-ABS-KEY('
    '"supply chain" OR "supply network" OR "supply networks"'
    ' OR "supply networks" OR "supply chains"'
    ') '
)

# Extra filters appended to every query, and the operator used to join fragments.
separator, restrictions = " AND ", " SRCTYPE(j) AND PUBYEAR AFT 2000"

In [None]:
# Every query ends with the supply-chain keyword group and the source/year filters;
# only the leading topic groups differ between the four combinations.
common_tail = [supply_q, restrictions]
fields_1 = [resilience_q, complexity_q] + common_tail
fields_2 = [sustainability_q, resilience_q] + common_tail
fields_3 = [sustainability_q, complexity_q] + common_tail
fields_4 = [sustainability_q, resilience_q, complexity_q] + common_tail

# Join the fragments of each combination into one query string.
queries = []
for field_group in (fields_1, fields_2, fields_3, fields_4):
    queries.append(separator.join(field_group))

# Print the total number of results reported for each query.
for query in queries:
    search_result = sr.get_search(query)
    print(search_result.tot_num_res)
# Number of articles per query (requested on 08/03/2021)
# 2983
# 3596
# 4717
# 985

In [None]:
# Run every query, keep the per-query results on disk, and build the merged
# dict / DataFrame covering all queries.
all_papers_dict = {}
query_frames = []  # per-query DataFrames, concatenated once after the loop

for idx, query in enumerate(queries):
    query_dict, query_df = sr.search_query(query, get_all=True)  # returns a dict and a df

    # Same merge semantics as {**all_papers_dict, **query_dict}:
    # on a key collision the later query wins.
    all_papers_dict.update(query_dict)
    # Collect the DataFrame (not the dict: pd.concat only accepts Series/DataFrame
    # objects and raises TypeError on a dict).
    query_frames.append(query_df)

    # Store a pickle file with the dict response, which contains data and request
    # metadata. The context manager makes sure the file is flushed and closed.
    with open('data/query_dict_test' + str(idx) + '.p', 'wb') as pickle_file:
        pickle.dump(query_dict, pickle_file)
    query_df.to_csv('data/query_df_' + str(idx) + '.csv', index=False)

# Concatenate once and discard rows that are exact duplicates across queries.
if query_frames:
    all_papers_df = pd.concat(query_frames).drop_duplicates().reset_index(drop=True)
else:
    all_papers_df = pd.DataFrame()

with open('data/all_papers' + '.p', 'wb') as pickle_file:
    pickle.dump(all_papers_dict, pickle_file)
all_papers_df.to_csv('data/all_papers_df' + '.csv')

In [8]:
# This step uses the stored data to avoid making queries when experimentation is required.

def _load_pickle(path):
    """Return the object unpickled from ``path``; the file is closed afterwards."""
    # NOTE: unpickling can execute arbitrary code. Only load files that were
    # written by this notebook, never files from an untrusted source.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


def _load_query_csv(path):
    """Read one per-query CSV, dropping a saved-index column if the file has one."""
    # errors="ignore": files written with index=False have no "Unnamed: 0" column,
    # and an unconditional drop would raise KeyError on them.
    return pd.read_csv(path).drop(columns=["Unnamed: 0"], errors="ignore")


# Merging dicts (on a key collision the later query wins)
query_dict0, query_dict1, query_dict2, query_dict3 = (
    _load_pickle('data/query_dict_test' + str(i) + '.p') for i in range(4)
)

all_papers_dict = {**query_dict0, **query_dict1, **query_dict2, **query_dict3}

# Merging df's
df_0, df_1, df_2, df_3 = (
    _load_query_csv('data/query_df_' + str(i) + '.csv') for i in range(4)
)

# Rows that are exact duplicates across the four queries are kept only once.
all_papers_df = pd.concat([df_0, df_1, df_2, df_3]).drop_duplicates().reset_index(drop=True)

In [None]:
def curate_for_cortext(_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``_df`` prepared for export to CorText.

    The input frame is left untouched. In the returned frame:

    * every missing value is replaced with a single space ``' '``;
    * the ``year`` column is cast to ``int16``;
    * the keyword separator ``' | '`` in the ``keywords`` column is replaced
      with ``' *** '``.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame with at least a ``year`` and a ``keywords`` column.

    Returns
    -------
    pandas.DataFrame
        The curated copy.

    Raises
    ------
    KeyError
        If ``year`` or ``keywords`` is not a column of ``_df``.
    ValueError
        If a ``year`` value cannot be converted to an integer (this includes
        missing years, which have been replaced with ``' '`` by then).
    """
    df = _df.fillna(' ')
    df['year'] = df['year'].astype('int16')
    # Literal (non-regex) replacement: no pattern escaping is needed, which avoids
    # the invalid '\|' escape sequence of a non-raw regex string.
    df['keywords'] = df['keywords'].str.replace(' | ', ' *** ', regex=False)
    return df
# To use CorText we can replace nan with ' ' (done inside curate_for_cortext).
# Two exports are produced: one for the union of all queries and one for the
# last query only (df_3).
all_papers_for_cortext = curate_for_cortext(all_papers_df)
final_search_cortext = curate_for_cortext(df_3)

cortext_exports = (
    (all_papers_for_cortext, 'data/papers_for_cortext.csv'),
    (final_search_cortext, 'data/papers_for_cortext_final.csv'),
)
for curated_frame, csv_path in cortext_exports:
    curated_frame.to_csv(csv_path, index=False)

In [None]:
# Importing from WOS.
# No API was available, manual exportation was performed.
wos_0 = pd.read_csv('data/wos_0.csv')
wos_1 = pd.read_csv('data/wos_1.csv')
wos_2 = pd.read_csv('data/wos_2.csv')

# Columns kept from the WOS export, in the order used for the rename below.
wos_columns = ['Abstract', 'DOI', 'Source Title', 'Author Keywords',
               'UT (Unique WOS ID)', 'Article Title', 'Publication Year']

# .copy() makes `wos` an independent frame, so the column assignments below do not
# trigger SettingWithCopyWarning on a column-subset selection.
wos = pd.concat([wos_0, wos_1, wos_2])[wos_columns].copy()

# Positional rename to the Scopus column names.
# NOTE(review): this assumes final_search_cortext has exactly these seven columns
# in the matching order -- confirm against the Scopus export.
wos.columns = final_search_cortext.columns

# Missing keywords become ' ' and the WOS '; ' separator becomes ' *** '.
wos['keywords'] = wos['keywords'].fillna(' ').str.replace('; ', ' *** ', regex=False)
# Records without a publication year are assigned 2021.
# NOTE(review): presumably the export year -- confirm.
wos['year'] = wos['year'].fillna(2021).astype('int16')

In [None]:
# Merging WOS and SCOPUS.
combined = pd.concat([final_search_cortext, wos], ignore_index=True)

# De-duplicate on DOI, keeping the first occurrence (Scopus rows come first).
# Records without a DOI (NaN, or a blank placeholder such as ' ') are never
# treated as duplicates of each other: a plain drop_duplicates(subset=['doi'])
# would collapse all of them into a single row and silently lose papers.
doi = combined['doi']
has_doi = doi.notna() & doi.astype(str).str.strip().ne('')
repeated_doi = has_doi & doi.duplicated(keep='first')
full_final = combined.loc[~repeated_doi].reset_index(drop=True)

# Vectorised lower-casing; unlike calling .lower() on every value it leaves
# missing journal names as NaN instead of raising AttributeError.
full_final['journal'] = full_final['journal'].str.lower()
full_final = full_final.sort_values(by=['year'], ascending=False)
full_final.to_csv('data/wos_scopus_for_cortext.csv', index=False)