Import tables for items, abstracts and references

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
scp_items = pd.read_csv(r'.\data_evolutionary_game_theory\items_scp_evolgath_202302021205.csv', index_col=None)
scp_abstracts = pd.read_csv(r'.\data_evolutionary_game_theory\abstracts_scp_evolgath_202302021207.csv', index_col=None)
scp_refs = pd.read_csv(r'.\data_evolutionary_game_theory\refs_scp_evolgath_202302021207.csv', index_col=None)
wos_items = pd.read_csv(r'.\data_evolutionary_game_theory\items_wos_evolgath_202301271505.csv', index_col=None)
wos_abstracts = pd.read_csv(r'.\data_evolutionary_game_theory\abstracts_wos_evolgath_202301271503.csv', index_col=None)
wos_refs = pd.read_csv(r'.\data_evolutionary_game_theory\refs_wos_evolgath_202301271505.csv', index_col=None)

  wos_refs = pd.read_csv(r'.\data_evolutionary_game_theory\refs_wos_evolgath_202301271505.csv', index_col=None)


Start first steps for perspective networks: Comparison of WoS and Scopus data sets

In [3]:
scp_items.loc[:,'item_title']
wos_items.loc[:,'item_title'] 
scp_items['item_title'].equals(wos_items['item_title']) #scp and wos do not hold the same publications

# Which titles are included in both scp and wos?
list(set(scp_items.item_title) & set(wos_items.item_title)) 

['Almost global convergence to p-dominant equilibrium',
 'Incentive policies for transboundary marine spatial planning: an evolutionary game theory-based analysis',
 'Models as products of interdisciplinary exchange: Evidence from evolutionary game theory',
 'The networked cooperative dynamics of adjusting signal strength based on information quantity',
 'An evolutionary game theory explanation of ARCH effects',
 'An integrated approach to evaluating sustainability in supply chains using evolutionary game theory',
 'Game analysis on prefabricated building evolution based on dynamic revenue risks in China',
 'The phenotypic gambit: selective pressures and ESS methodology in evolutionary game theory',
 'Market sentiments and convergence dynamics in decentralized assignment economies',
 'Evolutionary Game Theoretic Analysis of Advanced Persistent Threats Against Cloud Storage',
 'The predator-dependent replicator dynamics',
 'Variability in group size and the evolution of collective actio

In [4]:
# Which item_id do certain titles hold?
scp_items.loc[scp_items['item_title'] == 'The N-Player Trust Game and its Replicator Dynamics', 'item_id']
wos_items.loc[wos_items['item_title'] == 'The N-Player Trust Game and its Replicator Dynamics', 'item_id'] 

329    WOS:000377620600011
Name: item_id, dtype: object

WoS and Scopus data sets hold different item_ids for the same articles. WoS holds more articles and is therefore used in the following.

Filter for duplicates:

In [5]:
wos_items.item_id.nunique() # 1501
wos_refs.item_id_cited.nunique() #unique IDs for cited articles # 38708

38708

In [7]:
wos_refs['edited_title'] = wos_refs.ref_item_title.str.lower() #new column with all titles in lower-case
wos_refs.loc[:,'edited_title'] #Length lower-cased: 59935
wos_refs.ref_item_title.nunique() # Length: 29667
wos_refs.edited_title.nunique() # Length: 29443 (nunique drops NA by default, if not wanted, add dropna=False)
wos_refs['edited_title'].isna().sum() # 8435 NA values in edited_title
wos_refs.edited_title.nunique(dropna=False) # NA counts as 1 # 29444

series = wos_refs.groupby("item_id_cited")['item_id_citing'].nunique() #group the cited id by the citing id!
print(series) # output is equal for wos_refs.item_id_cited.nunique()

item_id_cited
000208243800003.1         1
000208243800003.10        1
000208243800003.11        1
000208243800003.12        1
000208243800003.14        1
                         ..
ZOOREC:ZOOR15103017478    1
ZOOREC:ZOOR15301000687    1
ZOOREC:ZOOR15301000688    1
ZOOREC:ZOOR15403013001    1
ZOOREC:ZOOR15407039974    1
Name: item_id_citing, Length: 38708, dtype: int64


Citation threshold

In [8]:
series.loc[series.values > 1] #cited more than once # 5291
wos_refs.loc[wos_refs.item_id_cited.isin(series.loc[series.values > 1].index)] 
#get where item_id_cited is in the series of cited more often than once

Unnamed: 0,item_id_citing,item_id_cited,citing_pubyear,ref_seq_nr,ref_pubyear,ref_item_title,ref_source_title,ref_authors,ref_volume,scopus_ref_issue,ref_pages,ref_doi,wos_ref_article_number,scopus_ref_text,scopus_ref_fulltext,wos_citation_context,edited_title
0,WOS:000508289100054,WOS:000223113500001,2017,1,2004.0,Toward a metabolic theory of ecology,ECOLOGY,"{""Brown, JH""}",85,,1771,,,,,,toward a metabolic theory of ecology
2,WOS:000508289100054,WOS:000073092100002,2017,3,1998.0,On economic applications of evolutionary game ...,JOURNAL OF EVOLUTIONARY ECONOMICS,"{""Friedman, D""}",8,,15,,,,,,on economic applications of evolutionary game ...
21,WOS:000274798200004,WOS:000188753800045,2009,7,2004.0,Evolutionary dynamics of biological games,SCIENCE,"{""Nowak, MA""}",303,,793,,,,,,evolutionary dynamics of biological games
22,WOS:000274798200004,WOS:000223799100054,2009,8,2004.0,Vaccination and the theory of games,PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCE...,"{""Bauch, CT""}",101,,13391,,,,,,vaccination and the theory of games
24,WOS:000274798200004,WOS:000257287600008,2009,10,2008.0,Fluctuating epidemics on adaptive networks,PHYSICAL REVIEW E,"{""Shaw, LB""}",77,,ARTN 066101,10.1103/PhysRevE.77.066101,,,,,fluctuating epidemics on adaptive networks
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59928,WOS:000600775500029,WOS:000248365100001,2021,49,2007.0,Evolutionary games on graphs,PHYSICS REPORTS-REVIEW SECTION OF PHYSICS LETTERS,"{""Szabo, G""}",446,,97,10.1016/j.physrep.2007.04.004,,,,,evolutionary games on graphs
59929,WOS:000600775500029,WOS:000365365300007,2021,50,2016.0,Evolution of cooperation in the spatial public...,PHYSICS LETTERS A,"{""Chen, MH""}",380,,40,10.1016/j.physleta.2015.09.047,,,,,evolution of cooperation in the spatial public...
59930,WOS:000600775500029,WOS:000325842000025,2021,51,2013.0,Effect of assessment error and private informa...,CHAOS SOLITONS & FRACTALS,"{""Uchida, S""}",56,,175,10.1016/j.chaos.2013.08.006,,,,,effect of assessment error and private informa...
59932,WOS:000600775500029,WOS:000313312200012,2021,53,2012.0,Gun for hire: Delegated enforcement and peer p...,JOURNAL OF PUBLIC ECONOMICS,"{""Andreoni, J""}",96,,1036,10.1016/j.jpubeco.2012.08.003,,,,,gun for hire: delegated enforcement and peer p...


Dictionary to find all duplicates for reference titles

In [10]:
dictedtit={} # think of a better name...

for index,s in wos_refs.iterrows(): #loop over rows in our df
    if s['edited_title'] in dictedtit: #if title already exists in our created dictionary
        if s['item_id_cited'] not in dictedtit[s['edited_title']]: #and id_item_cited is still not present in our dictionary
            dictedtit[s['edited_title']].append(s['item_id_cited']) #then append the id_item_cited to the existing title
    else: 
        dictedtit[s['edited_title']] = [s['item_id_cited']] #if not, create new key title with item_id_cited as a value
        
len(dictedtit) # 29444
print(dictedtit['toward a metabolic theory of ecology']) # testwise, gives item id cited for this title ['WOS:000223113500001']
# print(dictedtit) # IOPub data rate exceeded

print(list(dictedtit.items())[:100])

newdict = {k: dictedtit[k] for k in list(dictedtit)[:20]}

print(newdict) #gives chunk of 20 titles

['WOS:000223113500001']
[('toward a metabolic theory of ecology', ['WOS:000223113500001']), ('research on the function mechanism and implementation path and its application of ppp model in guangxi', ['000508289100054.9']), ('on economic applications of evolutionary game theory', ['WOS:000073092100002', '000365283500228.7']), (nan, ['000508289100054.8', 'WOS:000271211500265.3', 'WOS:000271211500265.4', 'WOS:000274798200004.1', 'WOS:000274798200004.27', 'WOS:000274798200004.30', 'WOS:000274798200004.18', 'WOS:000274798200004.29', 'WOS:000274798200004.26', '000648720900001.3', '000648720900001.14', '000648720900001.1', '000648720900001.8', '000648720900001.2', '000391330900109.22', 'WOS:A1996UH65400003.14', 'WOS:A1996UH65400003.19', 'WOS:A1996UH65400003.2', 'WOS:A1996UH65400003.50', 'WOS:A1996UH65400003.6', 'WOS:A1996UH65400003.8', 'WOS:A1996UH65400003.13', 'WOS:A1996UH65400003.24', 'WOS:A1996UH65400003.16', 'WOS:A1996UH65400003.11', 'WOS:A1996UH65400003.44', 'WOS:A1996UH65400003.7', 'WOS

Key-value pairs with > 1 value entries (id_item_cited) need relabeling

In [13]:
dictduplicate = {k: dictedtit[k] for k in dictedtit if len(dictedtit[k]) > 1} 

print(dictduplicate)
len(dictduplicate) # 549

wos_refs.isna()
print( sorted(dictduplicate.items(), key= lambda x: len(x[1]), reverse=True) ) #sort by number of values ()


del dictduplicate[np.nan] # nan needs the np in front!!
dictduplicatesorted = sorted(dictduplicate.items(), key= lambda x: len(x[1]), reverse=True) # sort by number of values

print(dictduplicatesorted)



In [14]:
dfduplicated =pd.DataFrame.from_dict(dict(dictduplicatesorted), orient='index') # df for better visibility
dfduplicated.to_csv('titles_with_multi_ids.csv', encoding='utf-8')
dfduplicated #check only really for the bigger first one, the other ones are most probably the same

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
evolutionary game theory,000604466500067.7,000527769700042.48,000498881800052.4,000296007000001.31,000550684000036.9,000297819200015.9,000729391000001.9,000464880100077.10,000458654000246.16,000607329000002.10,...,000372687500050.19,000525401500057.17,000315021200006.9,000457952400133.21,000774489700001.18,000441998400054.33,000859578700001.25,000467546000002.35,000692237100001.25,000303058200007.3
stochastic evolutionary game dynamics,WOS:A1990EE29600005,000550684000036.11,000705615300001.39,000451665100001.52,000358468800002.95,000208819200007.6,000328609300009.7,000390640200001.104,000824314200001.16,000412164500007.60,...,000308576500015.26,000453111000009.79,000506466300026.101,000703998200014.35,000810735600001.16,000461087100001.127,000399887700006.65,000852211500002.70,000686555900007.68,
population dynamics from game theory,000443369200005.34,000298615101113.14,000390640200001.114,000770156100001.47,000463926800012.52,000390611800074.56,000315578000022.49,000208819200003.51,000739761400001.34,000403140800143.23,...,000301755700014.69,000303079500016.61,000460672300005.26,000691225900001.201,000466888100007.68,000330149600013.44,000493323600001.121,,,
evolutionary game theory: a renaissance,000446715900001.27,000454371800006.35,000607329000002.21,000735059600001.27,000506466300026.57,000545174700001.18,000504781200007.19,000610240600001.34,000543362400008.15,000467248800010.27,...,,,,,,,,,,
random processes in genetics,000503281500014.23,000318996500002.85,000327688800029.19,000506466300026.56,000447274400007.35,000841295800002.84,000592161400001.27,000778570300002.41,000686555900007.49,000315428400001.13,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
a markov decision evolutionary game for individual energy management,000371252600031.6,WOS:000307321200022,,,,,,,,,...,,,,,,,,,,
delayed evolutionary game dynamics applied to the medium access control,000371252600031.10,000328058400031.21,,,,,,,,,...,,,,,,,,,,
a rule is not a rule if it changes from case to case (a reply to marshall's comment),000301755700005.74,WOS:000286406700022,,,,,,,,,...,,,,,,,,,,
a dynamic approach to the analysis of strategic alliances,000356820600015.26,000332429300010.21,,,,,,,,,...,,,,,,,,,,


Create replacing dictionary and a new column for the wos_refs df

In [15]:
wos_refs['item_id_clear'] = wos_refs['item_id_cited'] #clone item_id_cited column to later replace values only in there
wos_refs.head() #worked, new column created
type(dictduplicate) # dict

dictreplace ={v[0]: v[1:] for v in dictduplicate.values()} 
# ignore keys aka titles; take first id as key, remaining ids until last as values
# goal: {'BCI:BCI19522600023673': ["A", "B"]} 
dfdictreplace=pd.DataFrame.from_dict(dict(dictduplicatesorted), orient='index') # df for better visibility
dfdictreplace
# would need double looping through df and value lists, better try rearranging dictionary to long format

dictreplace_alt = {i: k for k,v in dictreplace.items() for i in v} # needs .items() wtf, otherwise returns meta stuff
# k becomes first id
# v is remaining list of other ids

# same as:
#dictreplace_alt = {}
#for k,v in dictreplace.items():
#    for i in v:
#        dictreplace_alt[i]=k

dictreplace_alt

{'000365283500228.7': 'WOS:000073092100002',
 '000453704700031.4': 'WOS:000223799100054',
 'WOS:000242408800016': '000391330900109.20',
 'WOS:A1992JV77700063': '000391330900109.9',
 '000378641000001.15': 'WOS:A1979HZ08500018',
 '000432507200005.21': 'WOS:A1979HZ08500018',
 '000326414400011.61': 'WOS:A1979HZ08500018',
 '000301755700018.36': 'WOS:A1978FK24700004',
 '000301970600014.15': 'WOS:A1978FK24700004',
 '000311051000007.11': 'WOS:A19647207B00008',
 '000334332400001.39': '000632837000003.16',
 '000309990600011.54': 'WOS:000268616500013',
 '000450854400008.52': 'WOS:000302127800001',
 '000527769700042.19': 'WOS:000353463600013',
 '000395731000002.18': 'WOS:000314285400024',
 '000472872700021.11': 'WOS:000365192500001',
 '000463926800012.41': 'WOS:000365192500001',
 '000466888100007.56': 'WOS:000365192500001',
 '000361767900008.25': '000817324700007.14',
 '000399887700006.20': '000817324700007.14',
 '000860281200006.38': '000817324700007.14',
 '000428251402116.9': 'WOS:00031581280004

In [16]:
wos_refs2 = wos_refs.replace({'item_id_clear':dictreplace_alt})
wos_refs.compare(wos_refs2)

# check titles with most ids
wos_refs2.loc[wos_refs['edited_title'] == 'evolutionary game theory'] # title needs manual check
wos_refs2.loc[wos_refs['edited_title'] == 'stochastic evolutionary game dynamics'] # title needs manual check
wos_refs2.loc[wos_refs['edited_title'] == 'population dynamics from game theory'] # all the same
wos_refs2.loc[wos_refs['edited_title'] == 'evolutionary game theory: a renaissance'] # all the same
wos_refs2.loc[wos_refs['edited_title'] == 'random processes in genetics'] # all the same
wos_refs2.loc[wos_refs['edited_title'] == 'the emergence of commitments and cooperation'] # all the same, checked manually, deviation due to changed author order

wos_items.item_title.nunique()
wos_refs2.item_id_clear.nunique() # 37870

37870

In [17]:
## ATTENTION FOR FUTURE VIVI: INDEXING HAPPENING HERE!

# manual check for distinct articles with title "evolutionary game theory"
weibull = [2809,19703,21738,34994,48101,56850] # weibull
wos_refs2.loc[weibull,'item_id_clear'] = '000527769700042.48' 
wos_refs2.at[7354, 'item_id_clear'] = '000498881800052.4' # easley ; this guy's name is not easeley btw, can't find publication
sandholm = [8738, 9257, 19512, 52016, 53875] # sandholm
wos_refs2.loc[sandholm,'item_id_clear'] = '000296007000001.31'
alexander = [12024, 29023, 32574, 37342, 44444, 57100] # J. McKenzie Alexander
wos_refs2.loc[alexander,'item_id_clear'] = '000297819200015.9'
wos_refs2.at[27274, 'item_id_clear'] = '000607329000002.10' # cressman
wos_refs2.at[37946, 'item_id_clear'] = '000525401500057.17' # smith
wos_refs2.at[46560, 'item_id_clear'] = '000457952400133.21' # siegmund
wos_refs2.at[50085, 'item_id_clear'] = '000441998400054.33' # vincent


# manual check for distinct articles with title "stochastic evolutionary game dynamics"
pd.set_option('display.max_rows', None) # force jupyter notebook to show me all the rows
# need to reduce down from 68 rows
stoch = wos_refs2.loc[wos_refs2['edited_title'] == 'stochastic evolutionary game dynamics']  # title needs manual check
stoch.loc[stoch['item_id_cited'] != 'WOS:A1990EE29600005']
# foster authors can keep their given item_id_clear
wallace = [9259, 56675] # wallace
wos_refs2.loc[wallace,'item_id_clear'] = '000550684000036.11' 

traulsen = [9911, 10864, 11060, 13370, 16064, 18399, 21053, 
            21296, 26125, 30109, 34608, 36098, 43663, 47440, 48532, 50225, 58148] # traulsen & hauert 
wos_refs2.loc[traulsen,'item_id_clear'] = '000705615300001.39' 

In [18]:
# drop useless columns that contain mostly nans
wos_refs2 = wos_refs2.drop(columns=['scopus_ref_issue', 
                        'wos_ref_article_number', 
                        'scopus_ref_text', 
                        'scopus_ref_fulltext', 
                        'wos_citation_context' ])
wos_refs2.head()

Unnamed: 0,item_id_citing,item_id_cited,citing_pubyear,ref_seq_nr,ref_pubyear,ref_item_title,ref_source_title,ref_authors,ref_volume,ref_pages,ref_doi,edited_title,item_id_clear
0,WOS:000508289100054,WOS:000223113500001,2017,1,2004.0,Toward a metabolic theory of ecology,ECOLOGY,"{""Brown, JH""}",85.0,1771.0,,toward a metabolic theory of ecology,WOS:000223113500001
1,WOS:000508289100054,000508289100054.9,2017,2,2016.0,Research on the Function Mechanism and Impleme...,Journal of Regional Financial Research,"{""Yuan, Zhao-xia""}",,48.0,,research on the function mechanism and impleme...,000508289100054.9
2,WOS:000508289100054,WOS:000073092100002,2017,3,1998.0,On economic applications of evolutionary game ...,JOURNAL OF EVOLUTIONARY ECONOMICS,"{""Friedman, D""}",8.0,15.0,,on economic applications of evolutionary game ...,WOS:000073092100002
3,WOS:000508289100054,000508289100054.8,2017,4,1997.0,,Evolutionary game theory,"{""Weibull, Jorge NW""}",,,,,000508289100054.8
4,WOS:000508289100054,000508289100054.7,2017,5,2014.0,Research on PPP project financing efficiency f...,Science Research Management,"{""Tang, Wei""}",,157.0,,research on ppp project financing efficiency f...,000508289100054.7


In [None]:
wos_refs2.to_csv('wos_refs_cleaned.csv')