In [95]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np
import matplotlib as mpl


# Fonts for plots
plt.rcParams['font.family'] = 'serif'
mpl.rcParams['font.serif'] = 'Times New Roman'

pd.set_option('display.max_columns', None)

# Path to data retrieval and storage
path = "C:/Users/kleinow/ownCloud/MA_Neuro"

In [96]:
# load reference data
cn_refs = pd.read_csv(path + '/cn_refs.csv')
cn_refs.head()
cn_refs.shape # (81694, 16)

  cn_refs = pd.read_csv(path + '/cn_refs.csv')


(81694, 16)

Removal of any full duplicate rows

In [97]:
cn_refs = cn_refs.drop_duplicates().reset_index(drop=True) # considers the entire row duplicates
cn_refs.shape # (81694, 16); no full duplicate rows in the retrieved data set, which is reasonable

(81694, 16)

Adding a column that contains the cited titles in lower-case

In [98]:
cn_refs['title_lower'] = cn_refs['ref_item_title'].str.lower()
#cn_refs.head()

cn_refs.title_lower.nunique() # 49802, drops na as default

49802

Trying out the first citation threshold before final cleaning: cited more than once, otherwise no connection to other papers possible

In [99]:
series_cn_refs = cn_refs.groupby("item_id_cited")['item_id_citing'].nunique() #  Group the cited id by the citing id
#print(series_cn_refs)

# Cited items with more than one citing item
ab1_cn_refs = series_cn_refs.loc[series_cn_refs.values > 1]

# Show positions of the cited items with more than one citing item
#cn_refs.loc[cn_refs.item_id_cited.isin(ab1_cn_refs.index)] # 30914 rows × 17 columns

Creating dictionaries to rename duplicates of cited reference ids that accidentally were assigned a new ID; adding a new column with the revised ids

In [100]:

dictcn={}

for index,s in cn_refs.iterrows(): # Loop over rows in our df
    if s['title_lower'] in dictcn: # If title already exists in our created dictionary,
        if s['item_id_cited'] not in dictcn[s['title_lower']]: # and id_item_cited is still not present in our dictionary...
            dictcn[s['title_lower']].append(s['item_id_cited']) # then append the id_item_cited to the existing title
    else: 
        dictcn[s['title_lower']] = [s['item_id_cited']] #if not, create new key title with item_id_cited as a value
        
len(dictcn) # 49803

49803

In [101]:
# Check if the dictionary is correct

print(list(dictcn.items())[:100])

chunk_dictcn = {k: dictcn[k] for k in list(dictcn)[:20]}
print(chunk_dictcn) # Gives chunk of 20 titles

[('cortex of cerebellum', ['WOS:A1975V163400004']), (nan, ['WOS:000086279300008.22', 'WOS:000086279300008.30', 'WOS:000086279300008.13', 'WOS:000086279300008.1', 'WOS:000086279300008.29', 'WOS:000086279300008.8', 'WOS:000086279300008.10', 'WOS:000086279300008.21', 'WOS:000086279300008.4', 'WOS:000086279300008.9', 'WOS:000086279300008.24', 'WOS:000086279300008.11', 'WOS:000086279300008.2', 'WOS:000086279300008.23', 'WOS:000088763200026.15', 'WOS:000088763200026.16', 'WOS:000088763200026.21', 'WOS:000088763200026.8', 'WOS:000088763200026.1', 'WOS:000088763200026.20', 'WOS:000088763200026.19', 'WOS:000088763200026.2', 'WOS:000169285300025.20', 'WOS:000169285300025.25', 'WOS:000169285300025.13', 'WOS:000169285300025.10', 'WOS:000169285300025.34', 'WOS:000169285300025.31', 'WOS:000169285300025.26', 'WOS:000169285300025.2', 'WOS:000173024600148.9', 'WOS:000173024600148.1', 'WOS:000173024600148.5', 'WOS:000174600200006.1', 'WOS:000174600200006.9', 'WOS:000174600200006.4', 'WOS:000174600200006

Dictionary Key-Value pairs with more than 1 entry need to be relabeled, otherwise the same references might have different labels.

In [102]:
dictdupcn = {k: dictcn[k] for k in dictcn if len(dictcn[k]) > 1} 

#print(dictdupcn)
len(dictdupcn) # 549

#cn_refs.isna()
#print(sorted(dictdupcn.items(), key= lambda x: len(x[1]), reverse=True) ) # Sort by number of values

del dictdupcn[np.nan] # nan needs the np in front!!
dictdupcn_sorted = sorted(dictdupcn.items(), key= lambda x: len(x[1]), reverse=True) # Sort by number of values

In [103]:
print(dictdupcn_sorted)

[('nest (neural simulation tool)', ['000311837300002.31', '000345024600003.9', '000579856100001.12', '000348207000001.18', '000370606600001.25', '000426546200001.10', '000430129400001.34', '000455034000009.6', '000596843300001.10', '000642586500006.34', '000646233500012.35', '000647460100001.11', '000876518600001.2', '000934089300002.12', '000209207100026.14', '000305415000005.10', '000373641800003.28', '000425314200001.19', '000444235500012.19', '000804814900001.23', '000311837300008.10', '000343228000001.19', '000419789500012.11', '000429596100002.15', '000499854500001.13', '000806558700001.11', '000384452300003.11', '000432839000001.10', '000460392200012.15', '000656962800012.3', '000804029700001.21', '000810997200001.31', '000386313100026.2', '000453105700001.32', '000450516000020.58', '000502758100001.16', '000528677400001.33', '000209207300033.3', '000305415000003.53', '000311837300007.19', '000348109300001.11', '000380668600001.14', '000478905500001.22', '000726266400001.17', '0

Create dataframe for lower-case titles and the assigned multiple IDs

In [104]:
# create df for cn_refs multiple ids per lower case reference title
dfdupcn=pd.DataFrame.from_dict(dict(dictdupcn_sorted), orient='index') # df for better visibility
dfdupcn.to_csv('cn_refs_titles_with_multi_ids.csv', encoding='utf-8')
dfdupcn.head(20) # 796 rows × 63 columns

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62
nest (neural simulation tool),000311837300002.31,000345024600003.9,000579856100001.12,348207000001.18,000370606600001.25,000426546200001.10,430129400001.34,455034000009.6,596843300001.1,642586500006.34,646233500012.35,647460100001.11,876518600001.2,934089300002.12,209207100026.14,305415000005.1,373641800003.28,425314200001.19,444235500012.19,804814900001.23,311837300008.1,343228000001.19,419789500012.11,429596100002.15,499854500001.13,806558700001.11,384452300003.11,432839000001.1,460392200012.15,656962800012.3,804029700001.21,810997200001.31,386313100026.2,453105700001.32,450516000020.58,502758100001.16,528677400001.33,209207300033.3,305415000003.53,311837300007.19,348109300001.11,380668600001.14,478905500001.22,726266400001.17,736022600001.28,805555900001.27,337948500002.17,348206200001.2,454180100015.9,495242400010.13,811824200001.16,937441800001.35,963727400001.27,209207300041.9,380500900617.4,396398700015.18,406561700001.2,449250100001.27,536333100001.13,581985500047.5,823396700001.27,859912400038.12,933363400001.62
imagenet classification with deep convolutional neural networks,000395099500043.47,000432199000012.32,000577089300003.38,626471200001.37,INSPEC:17133663,WOS:000402555400026,443157700005.85,486630500017.96,744537200017.8,434779300020.36,472127600009.32,595874700013.1,922928209014.34,379319900018.56,495400000052.72,621797000017.26,518892900013.4,390601400001.13,575874100001.47,336976000076.2,457636800259.19,383808700002.32,429191800009.92,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
very deep convolutional networks for large-scale image recognition,000577089300003.56,000739027800001.63,000744537200017.14,370822200004.62,000472127600009.74,000595874700013.2,883330900010.19,621797000017.37,416196400023.2,442861600066.26,617031000014.15,776460100001.53,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
adam: a method for stochastic optimization,000548564900074.23,000579856100001.22,000450651000031.8,577089300003.35,000626471200001.35,000870207000002.14,555729900082.11,617614000002.6,612948000029.6,922928203018.23,465513800001.27,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
learning from the past: approaches for reproducibility in computational neuroscience,000311837300002.17,000429167300001.37,000384570200006.37,430129400001.18,000454422200001.15,000431227400001.28,348109300001.9,209207300041.3,933363400001.49,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
a logical calculus of the ideas immanent in nervous activity,BCI:BCI19441800008456,000597437600001.101,000733527400001.1,335628800008.3,000379319900018.66,000432567500008.267,443568600010.43,366638300005.4,429191800009.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
every good regulator of a system must be a model of that system,000365919200001.55,000423004600007.27,000902758000001.6,459591300004.36,000505176000001.23,000651122200003.29,966581800001.64,444729800007.19,501388000039.32,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
pynn: a common interface for neuronal network simulators,000295220400001.10,000430891900005.21,000434813800008.11,384452300003.6,000460392200012.10,000364154200006.3,415207300002.6,451351400001.1,385253800001.7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
introducing the neuroscience gateway,000384570200006.133,000384452300003.36,000394260000006.85,401795900086.12,000349087800017.24,000760910506074.25,406561700001.51,857255200001.61,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
nineml: the network interchange for neuroscience modeling language,000384570200006.124,000209207100026.41,000305415000005.20,806558700001.26,000348109300001.24,000394260000004.19,449250100001.7,933363400001.57,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


The titles with multiple references need to be checked manually in one of the following steps. For very specific titles and similar meta information, we can assume the publications to actually be the same.

Replacing dictionary and adding new column with clean item_id_cited

In [105]:
cn_refs['item_id_clear'] = cn_refs['item_id_cited'] # Clone item_id_cited column to later replace values only in there
cn_refs.head() # new column created
type(dictdupcn) # dict

dictreplace_cn ={v[0]: v[1:] for v in dictdupcn.values()} 
# ignore keys aka titles; take first id as key, remaining ids until last as values
# goal: {'BCI:BCI19522600023673': ["A", "B"]} 
dfdictreplace_cn=pd.DataFrame.from_dict(dict(dictdupcn_sorted), orient='index') # df for better visibility
dfdictreplace_cn
# rearranging dictionary to long format to avoid double looping through df and value lists

dictreplace_cn_alt = {i: k for k,v in dictreplace_cn.items() for i in v} # needs .items(), otherwise returns meta stuff
# k becomes first id
# v is remaining list of other ids

# same as:
#dictreplace_alt = {}
#for k,v in dictreplace.items():
#    for i in v:
#        dictreplace_alt[i]=k

dictreplace_cn_alt

{'000447832000018.64': 'WOS:000083883300021',
 '000553719400001.16': 'WOS:A1997BH93C00030',
 '000452649401032.23': 'WOS:A1952UH81500008',
 '000402008600074.5': 'WOS:A1952UH81500008',
 '000301568100007.23': 'WOS:000171017300008',
 '000331486100014.7': 'WOS:000167248500017',
 'WOS:A1990BT68K00088': 'WOS:A1990HB91800089',
 '000313591500023.3': 'WOS:000231235700016',
 '000460855900002.174': '000209165300094.87',
 '000486417800002.95': '000209165300094.87',
 'WOS:A1983RE67600006': 'WOS:000275149200009',
 '000298103700001.5': 'WOS:000208047200002',
 '000339052800001.22': '000209204100004.15',
 '000345024600005.34': '000209529000006.10',
 'WOS:000230925500136': '000209529000006.14',
 '000426982100013.18': '000209529000006.14',
 'WOS:000082395600005': 'WOS:000221563000002',
 '000397326500005.11': 'WOS:A1997XM55000001',
 '000373939100076.8': 'WOS:A1997XM55000001',
 '000337948500002.24': 'WOS:A1997XM55000001',
 '000312652100120.5': 'CCC:000079284100002',
 'WOS:A1945UH01400003': 'WOS:000200527300

In [106]:
cn_refs2 = cn_refs.replace({'item_id_clear':dictreplace_cn_alt})

# Check for overlaps between old and new item_id_cited
cn_refs.compare(cn_refs2) #1337 rows × 2 columns

Unnamed: 0_level_0,item_id_clear,item_id_clear
Unnamed: 0_level_1,self,other
576,WOS:A1945UH01400003,WOS:000200527300004
1885,000301568100007.23,WOS:000171017300008
2134,WOS:000283727800004,000299100900007.2
2568,WOS:000176839200031,000333946600016.52
2974,000345024600003.9,000311837300002.31
3184,000360499900014.21,CCC:000224129000015
4326,WOS:000283727800004,000299100900007.2
4544,000423004600007.27,000365919200001.55
4560,000423004600007.58,WOS:000309222000007
4706,000429167300001.37,000311837300002.17


Now, the item_ids of the references, that have multiple IDs for the same tile, are checked manually. If the titles come out to be the same, the item_id_clear are replaced with the first one occuring the dataframe. If the titles are in fact different, they are re-assigned their old item_id_cited.
It is important to tkae care of the indexing if anything changes in the original dataframe!

The first 20 entries are checked by defualt. Further references' item_id_clear are checked if the title is generic and could potentially hold different publications by different authors.

In [107]:
pd.set_option('display.max_rows', None) # Show all rows

In [108]:
# check titles with most ids, take care of indexing
# cn_refs2.loc[cn_refs2['title_lower'] == 'nest (neural simulation tool)'] # uncomment to see all rows
# Nest (neural simulation tool)	refers to a computer program by Gewaltig and Diesmann. All references refer to the same.

In [109]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'imagenet classification with deep convolutional neural networks']
# All the same reference, Weinberger refers to Eds., In: Pereira, F., Burges, C.J.C., Bottou, L. and Weinberger, K.Q., Eds., Advances in Neural Information Processing Systems, Vol. 25, Curran Associates, Inc., Red Hook, NY, 1097-1105.

In [110]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'very deep convolutional networks for large-scale image recognition']
# All the same, authors are Simonyan and Zisserman

In [111]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'adam: a method for stochastic optimization']
# All the same, authors are Diederik P. Kingma, Jimmy B

In [112]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'learning from the past: approaches for reproducibility in computational neuroscience']
# All the same

In [113]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'a logical calculus of the ideas immanent in nervous activity']
# All the same

In [114]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'every good regulator of a system must be a model of that system']
# All the same

In [115]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'pynn: a common interface for neuronal network simulators']
# All the same

In [116]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'introducing the neuroscience gateway']
# All the same, first author's name is Subhashini Sivagnanam

In [117]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'nineml: the network interchange for neuroscience modeling language']
# All the same

In [118]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'modified multi-layered model of temperature dependent motor nerve axons']
# All the same

In [119]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'deep learning']

# LeCun keeps item_id_clear

# Rusk
rusk = [6862, 23667, 46335, 46709, 69750, 71397, 75506, ]
cn_refs2.loc[rusk, 'item_id_clear'] = 'WOS:000367463600021' 


# Goodfellow
goodfellow = [21838, 28534, 62622, 62746, 79367, 78941]
cn_refs2.loc[goodfellow, 'item_id_clear'] = 'WOS:000412476200021' 

# Hof
cn_refs2.loc[62669, 'item_id_clear'] = '000641587300002.138'
#cn_refs2.loc[cn_refs2['title_lower'] == 'deep learning'] # Check if changes applied

In [120]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'empirical models of spiking in neural populations']
# All the same

In [121]:
# cn_refs2.loc[cn_refs2['title_lower'] == 'learning phrase representations using rnn encoder-decoder for statistical machine translation']
# All the same

In [122]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'models and methods for investigation of the human motor nerve fibre']
# All the same

In [123]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'sequence to sequence learning with neural networks']
# All the same, authors are Ilya Sutskever, Oriol Vinyals, Quoc V. Le

In [124]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'a survey of neuromorphic computing and neural networks in hardware']
# All the same, authors are Catherine D. Schuman, Thomas E. Potok, Robert M. Patton, J. Douglas Birdwell, Mark E. Dean, Garrett S. Rose, James S. Plank

In [125]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'the open source brain initiative: enabling collaborative modelling in computational neuroscience']
# All the same

In [126]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'analysis of neural excitability and oscillations']
# All the same

In [127]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'a theory of pavlovian conditioning: variations in the effectiveness of reinforcement and nonreinforcement']
# All the same

From now on, only rather generic or very short titles are checked manually.

In [128]:
#dfdupcn # Open for full view of references lower case titles with multiple ids

In [129]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'neural turing machines']
# Same, authors are Graves, Alex, Greg Wayne, and Ivo Danihelka

In [130]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'the neuron simulation environment'] #
# All the same

In [131]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'cable theory for dendritic neurons'] 
# Same, Idan Segev is an editor

In [132]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'computational psychiatry']

# Montague keeps item_id_clear

# Wang
wang = [11260, 23169, 23523, 35120, 35474, 37752, 43005, 43966, 46238, 46325,
        48018, 60375, 70848]
cn_refs2.loc[wang, 'item_id_clear'] = 'WOS:000344168100016'
        

# Redish
cn_refs2.loc[30212, 'item_id_clear'] = '000530893806005.20'

# Huys
# hard to find, in Zeitschrift für Psychiatrie, Psychologie und Psychotherapie (2017), 65 (1), 21–26, DOI 10.1024/1661-4747/a000297
cn_refs2.loc[52370, 'item_id_clear'] = 'WOS:000392303100003	'

#cn_refs2.loc[cn_refs2['title_lower'] == 'computational psychiatry'] # Check if changes applied

In [133]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'brian simulator'] # yes, it is the brian simulator, not the brain simulator
# All the same

In [134]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'computing with spiking neuron networks']
# all the same

In [135]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'deep boltzmann machines']
# all the same

In [136]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'deep neural networks in computational neuroscience']
# all the same

In [137]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'openai gym']
# all the same

In [138]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'computational neuroscience']

# Sejnowski keeps item_id_clear

# Eliasmith
cn_refs2.loc[33263, 'item_id_clear'] = 'WOS:000334890900013'
#cn_refs2.loc[cn_refs2['title_lower'] == 'computational neuroscience']

In [139]:
cn_refs2.loc[cn_refs2['title_lower'] == 'adaptive resonance theory']

# Carpenter keeps item_id_clear

# Grossberg
cn_refs2.loc[17621, 'item_id_clear'] = '000346797500001.35'

#cn_refs2.loc[cn_refs2['title_lower'] == 'adaptive resonance theory'] # Check if changes applied


In [140]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'theory of communication']
# All the same

In [141]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'predicting parameters in deep learning']
# All the same

In [142]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'essentials of diagnosis']
# All the same

In [143]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'what is computational neuroscience?']
# All the same

In [144]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'reinforcement learning: an introduction']

# Sutton keeps item_id_clear

# Montague # Actual title was "Review of Reinforcement Learning: An Introduction"!
cn_refs2.loc[54175, 'item_id_clear'] = 'WOS:000082310000008'

#cn_refs2.loc[cn_refs2['title_lower'] == 'reinforcement learning: an introduction'] # Check if changes applied

In [145]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'computational modelling of visual attention']
# All the same

In [146]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'working memory']
# All the same

In [147]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'spiking neurons']
# All the same

In [148]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'neural systems engineering']
# All the same

In [149]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'simple model of spiking neurons']
# All the same

In [150]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'the blue brain project']
# All the same

In [151]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'motivation']

# Butz keeps item_id_clear

# Bargh
cn_refs2.loc[55698, 'item_id_clear'] = '000866873300001.20'

#cn_refs2.loc[cn_refs2['title_lower'] == 'motivation'] # Check if changes applied


In [152]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'fitzhugh-nagumo model'] # Same, Eugene M. Izhikevich and Richard FitzHugh (2006), Scholarpedia, 1(9):1349.

In [153]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'spectral analysis of signals']
# All the same

In [154]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'color vision mechanisms']
# All the same

In [155]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'stdp in recurrent neuronal networks']
# All the same

In [156]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'retinal prosthesis']
# All the same

In [157]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'parkinson\'s disease'] # Take care of the apostrophe

# Lees keeps item_id_clear

# Kalia # doi: 10.1016/S0140-6736(14)61393-3
cn_refs2.loc[54883, 'item_id_clear'] = 'WOS:000360290000033'

#cn_refs2.loc[cn_refs2['title_lower'] == 'parkinson\'s disease'] # Check if changes applied

In [158]:
#cn_refs2.loc[cn_refs2['title_lower'] == '"nestml: a modeling language for spiking neurons,"'] # Mistake in title, should be 'nestml: a modeling language for spiking neurons'

# Assign to correct title and author item_id_clear
cn_refs2.loc[13750, 'item_id_clear'] = '000425314200001.45'
cn_refs2.loc[72519, 'item_id_clear'] = '000425314200001.45'

#cn_refs2.loc[cn_refs2['title_lower'] == '"nestml: a modeling language for spiking neurons,"']

In [159]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'short-term synaptic plasticity']
# All the same

In [160]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'visual map']
# All the same

In [161]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'untitled'] 

# Assign publication of Bailey (2015) to correct title and author item_id_clear 
#cn_refs2.loc[cn_refs2['ref_authors'] == '{"Bailey, J"}']
cn_refs2.loc[19617, 'item_id_clear'] = 'WOS:000362011800012'

# Assign Benabid (2009) to correct title and author item_id_clear
# cn_refs2.loc[cn_refs2['title_lower'] == 'non-animal replacements for acute toxicity testing'] # ? No assignment possible

In [162]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'system identification']
#  All the same

In [163]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'feature visualization']
# All the same

In [164]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'arousal systems']

# Marocco keeps item_id_clear

# Jones
cn_refs2.loc[61740, 'item_id_clear'] = 'WOS:000182352300088'

#cn_refs2.loc[cn_refs2['title_lower'] == 'arousal systems'] # Check if changes applied

In [165]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'guillain-barre syndrome']

# Kuwabara keeps item_id_clear

# Younger reference is by Nortina Shahrizaila, Helmar C Lehmann, Satoshi Kuwabara
cn_refs2.loc[24264, 'item_id_clear'] = 'WOS:000633010800023'

#cn_refs2.loc[cn_refs2['title_lower'] == 'guillain-barre syndrome'] # Check if changes applied

In [166]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'attention and performance']

# Pashler keeps item_id_clear

# Allport
cn_refs2.loc[54545, 'item_id_clear'] = '000621703300032.1'

#cn_refs2.loc[cn_refs2['title_lower'] == 'attention and performance'] # Check if changes applied

In [167]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'schizophrenia']

# Mueser keeps item_id_clear

# Kahn
cn_refs2.loc[62564, 'item_id_clear'] = 'WOS:000381347500001'

#cn_refs2.loc[cn_refs2['title_lower'] == 'schizophrenia'] # Check if changes applied

In [168]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'calcium oscillations']

# Thul keeps item_id_clear

# Dupont
cn_refs2.loc[46846, 'item_id_clear'] = 'WOS:000287846200003'

#cn_refs2.loc[cn_refs2['title_lower'] == 'calcium oscillations'] # Check if changes applied

In [169]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'on random graphs']
# All the same

In [170]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'spike sorting']
# All the same

In [171]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'random forests']
# All the same

In [172]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'hebbian plasticity']
# All the same

In [173]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'nan'] # no nans left


In [174]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'color appearance']

# Judd keeps item_id_clear

# Shevell # probably refers to Brainard, D. H. (2003). Color Appearance and Color Difference Specification. In S. K. Shevell (Ed.), The science of color (pp. 191–216). Elsevier. https://doi.org/10.1016/B978-044451251-2/50006-4
# or to Book Chapter 57: Color Appearance By Kenneth Knoblauch , Steven K. Shevell DOI: https://doi.org/10.7551/mitpress/7131.003.0067, 2003
cn_refs2.loc[51124, 'item_id_clear'] = '000339052800001.33'

#cn_refs2.loc[cn_refs2['title_lower'] == 'color appearance'] # Check if changes applied


In [175]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'neuronal noise']
# All the same

In [176]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'mechanisms in science']
# All the same

In [177]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'human brain project']

# Website keeps item_id_clear

# Grillner
cn_refs2.loc[60624, 'item_id_clear'] = 'WOS:000382270400002'
cn_refs2.loc[68636, 'item_id_clear'] = 'WOS:000382270400002'

#cn_refs2.loc[cn_refs2['title_lower'] == 'human brain project'] # Check if changes applied

In [178]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'social anxiety disorder']

# Stein keeps item_id_clear

# Leichsenring
cn_refs2.loc[63237, 'item_id_clear'] = 'WOS:000402798200009'

#cn_refs2.loc[cn_refs2['title_lower'] == 'social anxiety disorder'] # Check if changes applied

In [179]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'ideational apraxia']

# Poeck keeps item_id_clear

# Derenzi , doi: 10.1093/brain/111.5.1173 
cn_refs2.loc[65784, 'item_id_clear'] = 'WOS:A1988Q907300011'

#cn_refs2.loc[cn_refs2['title_lower'] == 'ideational apraxia'] # Check if changes applied

In [180]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'spike-timing dependent plasticity']
# All the same

In [181]:
#cn_refs2.loc[cn_refs2['title_lower'] == 'mental causation']

# Yablo keeps item_id_clear

# Bennett
cn_refs2.loc[71465, 'item_id_clear'] = 'WOS:000214594900014'

#cn_refs2.loc[cn_refs2['title_lower'] == 'mental causation'] # Check if changes applied

In the next step, columns that do not hold useful information are dropped.

In [182]:
cn_refs2.head()

# drop useless columns that contain mostly nans
cn_refs2 = cn_refs2.drop(columns=['scopus_ref_issue', 
                        'wos_ref_article_number', 
                        'scopus_ref_text', 
                        'scopus_ref_fulltext', 
                        'wos_citation_context'])
cn_refs2.head()

Unnamed: 0,item_id_citing,item_id_cited,citing_pubyear,ref_seq_nr,ref_pubyear,ref_item_title,ref_source_title,ref_authors,ref_volume,ref_pages,ref_doi,title_lower,item_id_clear
0,WOS:000086279300008,WOS:A1975V163400004,2000,1,1975.0,CORTEX OF CEREBELLUM,SCIENTIFIC AMERICAN,"{""LLINAS, RR""}",232.0,56.0,,cortex of cerebellum,WOS:A1975V163400004
1,WOS:000086279300008,WOS:000086279300008.22,2000,2,1890.0,,PRINCIPLES PSYCHOL,"{""JAMES W""}",,,,,WOS:000086279300008.22
2,WOS:000086279300008,WOS:A1993KU17600060,2000,3,1993.0,THE NEURAL CORRELATES OF THE VERBAL COMPONENT ...,NATURE,"{""PAULESU, E""}",362.0,342.0,,the neural correlates of the verbal component ...,WOS:A1993KU17600060
3,WOS:000086279300008,WOS:A1993MR37000021,2000,4,1993.0,MEMORY - VERBAL AND VISUAL SUBSYSTEMS OF WORKI...,CURRENT BIOLOGY,"{""BADDELEY, AD""}",3.0,563.0,,memory - verbal and visual subsystems of worki...,WOS:A1993MR37000021
4,WOS:000086279300008,WOS:A1995RP75600044,2000,5,1995.0,MODULATION OF MEMORY FIELDS BY DOPAMINE D1 REC...,NATURE,"{""WILLIAMS, GV""}",376.0,572.0,,modulation of memory fields by dopamine d1 rec...,WOS:A1995RP75600044


The 13 columns left are:

item_id_citing, item_id_cited, citing_pubyear, ref_seq_nr, ref_pubyear, ref_item_title, ref_source_title, ref_authors, ref_volume, ref_pages, ref_doi, title_lower, item_id_clear

In [183]:
cn_refs2.shape # (81694, 13)

(81694, 13)

Before applying any filter thresholds of minimum cited, the dataframe holds 81,694 entries.

In [184]:
cn_refs2.title_lower.nunique() # 49802, drops na as default
cn_refs2.item_id_cited.nunique() # 59884, drops na as default
cn_refs2.item_id_clear.nunique() # 58722, drops na as default

58722

A first filter is applied to the dataframe: only references that are cited more than once are kept. This is done to avoid any references that are only cited once and therefore cannot be connected to any other paper.

In [185]:
len(cn_refs2) #81694

81694

In [186]:
series_cn_refs2 = cn_refs2.groupby("item_id_clear")['item_id_citing'].nunique() # Group the cited id by the citing id
#print(series_cn_refs2) 

# Cited items with more than one citing item
above1_cn_refs2 = series_cn_refs2.loc[series_cn_refs2.values > 1]
#print(above1_cn_refs2) 

# Show positions of the cited items with more than one citing item
cn_refs2 = cn_refs2.loc[cn_refs2.item_id_clear.isin(above1_cn_refs2.index)].reset_index(drop=True) # Remember that the index is being reset!!

len(cn_refs2) # 32592

32592

In [187]:
cn_refs2.head()

Unnamed: 0,item_id_citing,item_id_cited,citing_pubyear,ref_seq_nr,ref_pubyear,ref_item_title,ref_source_title,ref_authors,ref_volume,ref_pages,ref_doi,title_lower,item_id_clear
0,WOS:000086279300008,WOS:A1995RP75600044,2000,5,1995.0,MODULATION OF MEMORY FIELDS BY DOPAMINE D1 REC...,NATURE,"{""WILLIAMS, GV""}",376,572,,modulation of memory fields by dopamine d1 rec...,WOS:A1995RP75600044
1,WOS:000086279300008,WOS:A1996VV46700007,2000,7,1996.0,Regional and cellular fractionation of working...,PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCE...,"{""GoldmanRakic, PS""}",93,13473,,regional and cellular fractionation of working...,WOS:A1996VV46700007
2,WOS:000086279300008,WOS:A1985ARE2300019,2000,14,1985.0,PRIMATE FRONTAL EYE FIELDS .2. PHYSIOLOGICAL A...,JOURNAL OF NEUROPHYSIOLOGY,"{""BRUCE, CJ""}",54,714,,primate frontal eye fields .2. physiological a...,WOS:A1985ARE2300019
3,WOS:000086279300008,WOS:A1980JC97100014,2000,15,1980.0,DISSOCIATION OF VISUAL AND SACCADE-RELATED RES...,JOURNAL OF NEUROPHYSIOLOGY,"{""MAYS, LE""}",43,207,,dissociation of visual and saccade-related res...,WOS:A1980JC97100014
4,WOS:000086279300008,WOS:A1985ADU5200001,2000,16,1985.0,PRIMATE FRONTAL EYE FIELDS .1. SINGLE NEURONS ...,JOURNAL OF NEUROPHYSIOLOGY,"{""BRUCE, CJ""}",53,603,,primate frontal eye fields .1. single neurons ...,WOS:A1985ADU5200001


Saving the cleaned references dataframe as csv file

In [188]:
cn_refs2.to_csv('cn_refs_cleaned.csv', index=False)