In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
papers = pd.concat([
  pd.read_pickle('/disks/qnap3/shared/scopus-22/data/paper_detail/num_cited.pickle'),
  pd.read_pickle('/disks/qnap3/shared/scopus-22/data/paper_detail/doi.pickle'),
  pd.read_pickle('/disks/qnap3/shared/scopus-22/data/paper_detail/title.pickle'),
  pd.read_pickle('/disks/qnap3/shared/scopus-22/data/paper_detail/journal.pickle'),
  # pd.read_pickle('/disks/qnap3/shared/scopus-22/data/paper_detail/authids.pickle'),
  pd.read_pickle('/disks/qnap3/shared/scopus-22/data/paper_detail/year.pickle'),
], axis = "columns")
print(papers.head())

papers = papers[papers["doi"]!="NO DOI"]
papers = papers[papers["journal"]!=0]
journal_names = pd.read_pickle('/disks/qnap3/shared/scopus-22/data/id_names/journal_info.pickle')
journal_names.head()

     source                                                doi  \
107    12.0    10.1175/1520-0442(1997)010<2184:ASAIUF>2.0.CO;2   
110     6.0                                  10.1063/1.1285965   
116    32.0                            10.1103/PhysRevE.56.623   
117    77.0                                     10.1086/304826   
118    55.0  10.1002/(SICI)1521-4109(199907)11:9<623::AID-E...   

                                                 title      journal  year  
107  Assessing surface-atmosphere interactions usin...        13550  1997  
110  Symmetry of quantum phase space in a degenerat...        27430  2000  
116  Friction in strongly confined polymer melts: E...  21100855841  1997  
117  Nearby young dwarf galaxies: Primordial gas an...        26765  1997  
118  Determination of iodate in salt samples with a...        23979  1999  


Unnamed: 0_level_0,name,country
journalid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,NO SOURCENAME,NO SOURCECOUNTRY
12000,Journal of Technology in Counseling,usa
12001,Journal of the Experimental Analysis of Behavior,NO SOURCECOUNTRY
12002,Journal of the History of the Behavioral Sciences,NO SOURCECOUNTRY
12003,Reconstructionist,usa


In [30]:
# journalidの取得
journal_names[journal_names['name'].isin([''])]                             

Unnamed: 0,journalid,name,country


In [4]:
journal_subset = [
  # Disciplinal
  # Politics
  # 15557 # APSR
  # 20333 #	Journal of Politics	gbr
  # 15555	# American Journal of Political Science	gbr
  # 25725	# World Politics	NO SOURCECOUNTRY
  # 13447	# Comparative Political Studies	usa

  # Physics 
  # 29143,85318,29465,29459, # Phisical Review A-D
  # 21100874237,21100874236,21100829284,21100779241, # Physical Review A-D
  # 29150, # Phisical Review Letters
  
  24222, #Scientometrics
  5100155103, #Journal of Informetrics
  22900, # Research Policy
  22894, # Research Evaluation
  21101062805, #Quantitative Science Studies
  14726, # Technovation
  
  # # General
  # 21100838541, # Nature Human Behaviour
  # 19700182758, # Nature Communications
  # 23571, # Science
  # 21121 # Proceedings of the National Academy of Science
] 

papers_subset = papers[papers["journal"].isin(journal_subset)].fillna(0)
papers_subset_valid = papers_subset[(papers_subset["source"]>0) & (papers_subset["doi"]!="NO DOI")]



In [5]:
papers_subset

Unnamed: 0,source,doi,title,journal,year
3581,95.0,10.1016/S0048-7333(99)00120-1,Overseas innovations by Japanese firms: An ana...,22900,2001
3783,89.0,10.1016/S0048-7333(98)00063-8,The entry mode choice of MNEs: An evolutionary...,22900,1998
4368,42.0,10.1016/S0048-7333(98)00076-6,Technical change and incorporated R & D in the...,22900,1998
13307,13.0,10.1016/0048-7333(95)00876-4,Features of policy-making processes in Japan's...,22900,1996
14932,119.0,10.1007/BF02458472,A compendium of issues for citation analysis,24222,1999
...,...,...,...,...,...
23176505,0.0,10.1016/0048-7333(87)90003-5,Environmental research in Israel: On the need ...,22900,1987
24053128,0.0,10.1016/0166-4972(88)90019-3,Innovation and technical change in the U.S. st...,14726,1988
24717519,0.0,10.1016/0166-4972(89)90012-6,Flexible production automation: A description ...,14726,1989
24752856,0.0,10.1016/0166-4972(89)90030-8,Holography-a current perspective,14726,1989


## ちあきんぐコード

In [7]:
groups = papers_subset_valid.groupby(["journal","year"])["source"].mean()

journal_year_normalized = groups.reset_index()
relative_citation = pd.merge(papers_subset_valid,\
                               journal_year_normalized,\
                               on=["journal","year"])
weights = relative_citation["source_x"].values/relative_citation["source_y"].values
relative_citation = relative_citation.assign(weights=weights)
relative_citation.head()

Unnamed: 0,source_x,doi,title,journal,year,source_y,weights
0,95.0,10.1016/S0048-7333(99)00120-1,Overseas innovations by Japanese firms: An ana...,22900,2001,146.690476,0.647622
1,155.0,10.1016/S0048-7333(00)00147-5,The changing composition of innovative activit...,22900,2001,146.690476,1.056647
2,184.0,10.1016/S0048-7333(00)00098-6,Location and network effects on innovation suc...,22900,2001,146.690476,1.254342
3,129.0,10.1016/S0048-7333(00)00095-0,Market- and committee-based mechanisms in the ...,22900,2001,146.690476,0.879403
4,185.0,10.1016/S0048-7333(99)00102-X,Determinants of location of overseas R&D activ...,22900,2001,146.690476,1.261159


In [8]:
sample_papers = relative_citation.sample(n=3, weights=relative_citation["weights"])
for v in sample_papers.iterrows():
  print(v[1]["title"])
  print(f"   index: {v[1]['weights']:.3}")
  print(f"   published in {v[1]['year']}")
  print(f"   https://doi.org/"+ v[1]["doi"])

Who are the researchers that are collaborating with industry? An analysis of the wine sectors in Chile, South Africa and Italy
   っs index: 0.955
   published in 2010
   https://doi.org/10.1016/j.respol.2010.03.007
R&D evaluation in portugal
   っs index: 0.422
   published in 1995
   https://doi.org/10.1093/rev/5.1.89
Diffusion and scale dynamics: A case study
   っs index: 0.171
   published in 1982
   https://doi.org/10.1016/0166-4972(82)90009-8


## Century of Physics の再現

### Sci Sci専門誌によるラベル付け

### 引用文献データの読み込み

In [9]:
references = pd.read_pickle('/disks/qnap3/shared/scopus-22/data/citations_gb.pickle')

In [32]:
references.head()

Unnamed: 0,target,source
107,"[27808141, 27079122, 29503365, 84967885639, 29...","[141593895, 33750273488, 34210912, 36217526, 8..."
110,"[4243070017, 3342986001, 4243966752, 424370794...","[85036201347, 37649031574, 85037178788, 269444..."
116,"[742325303, 12554168, 12129029, 36449004335, 3...","[60349112291, 77956435401, 705988, 31444444683..."
117,"[24528508, 4243269019, 6144227006, 160235, 250...","[1177552, 36682633, 84890909098, 23044451834, ..."
118,"[21393006, 27960667, 33845277887, 84987467001,...","[84863076513, 84944735625, 33947636620, 787515..."


In [35]:
references1 = references[['target']]
references1.head()

Unnamed: 0,target
107,"[27808141, 27079122, 29503365, 84967885639, 29..."
110,"[4243070017, 3342986001, 4243966752, 424370794..."
116,"[742325303, 12554168, 12129029, 36449004335, 3..."
117,"[24528508, 4243269019, 6144227006, 160235, 250..."
118,"[21393006, 27960667, 33845277887, 84987467001,..."


In [34]:
references.shape

(66870421, 2)

### 引用文献中の割合

#### nullの取得

In [13]:
journal_count = papers.groupby(['journal']).size()
journal_count = pd.DataFrame(journal_count, columns=['count']).reset_index()
all_size = journal_count['count'].sum()
journal_count['null_ratio'] = journal_count['count']/all_size

journal_count.head()

Unnamed: 0,journal,count,null_ratio
0,12001,3445,5.8e-05
1,12002,1133,1.9e-05
2,12004,798,1.3e-05
3,12005,2680,4.5e-05
4,12006,3321,5.6e-05


In [14]:
# SciSciジャーナル
journal_count[journal_count["journal"].isin(journal_subset)]

Unnamed: 0,journal,count,null_ratio
1737,14726,2214,3.7e-05
6694,22894,852,1.4e-05
6697,22900,3631,6.1e-05
7528,24222,6476,0.000109
19605,5100155103,1156,1.9e-05
287191,21101062805,151,3e-06


In [15]:
scisci_ratio = journal_count[journal_count["journal"].isin(journal_subset)]['count'].sum()/all_size

scisci_ratio 

0.00024389209472620738

#### 引用文献中のSci Sci論文の比率の取得

In [16]:
#トップ総合誌

genreral_journal_subset = [
  # # General
  21100838541, # Nature Human Behaviour
  19700182758, # Nature Communications
  23571, # Science
  21121 # Proceedings of the National Academy of Science
] 

In [17]:
target_papers = papers[papers["journal"].isin(genreral_journal_subset)].fillna(0)
target_papers.head()

Unnamed: 0,source,doi,title,journal,year
1452,65.0,10.1126/science.179.4070.298,Rickettsia-like bacterium associated with Pier...,23571,1973
1706,68.0,10.1126/science.246.4930.649,X-ray diffraction to 302 gigapascals: High-pre...,23571,1989
3575,163.0,10.1126/science.218.4577.1082,Pathfinding by peripheral pioneer neurons in g...,23571,1982
3605,46.0,10.1073/pnas.86.20.7814,Isolation of the α subunits of GTP-binding reg...,21121,1989
4283,28.0,10.1126/science.187.4182.1193,True polar wander since the early cretaceous,23571,1975


In [18]:
target_papers.index.max()

85129306996

In [19]:
target_papers_eid_ls = target_papers.index.values

In [20]:
scisci_paper_eid_ls = papers_subset.index.values

In [21]:
references.loc[107]

target    [27808141, 27079122, 29503365, 84967885639, 29...
source    [141593895, 33750273488, 34210912, 36217526, 8...
Name: 107, dtype: object

In [25]:
ref_dict = references1.to_dict()['target']

In [24]:
ref_dict['target']

{107: [27808141,
  27079122,
  29503365,
  84967885639,
  29471938,
  29538656,
  27063593,
  29751999,
  25584973,
  27789586,
  27088787,
  84977382783,
  28182096,
  22172279,
  30377402,
  31399752,
  28881395,
  28869199,
  27091260,
  26273018,
  21844515741,
  27085408,
  28058226,
  26050570,
  12244249820],
 110: [4243070017,
  3342986001,
  4243966752,
  4243707944,
  4243736648,
  39970492,
  347739045,
  41998611,
  39846060,
  1095332,
  1840092],
 116: [742325303,
  12554168,
  12129029,
  36449004335,
  33749178498,
  1462840,
  30147797,
  30083674,
  29754167,
  28210135,
  3543014842,
  70349942189,
  28521716,
  29250471,
  28494972,
  36549100911,
  36449007230,
  5915389,
  1721953,
  85040214461,
  4243180110,
  642371520],
 117: [24528508,
  4243269019,
  6144227006,
  160235,
  25099716,
  79960847910,
  85052118421,
  1931938,
  2981482,
  21744442565,
  21744442599,
  21444441814,
  21344461137,
  400462,
  3242864449,
  1541154,
  12044257872,
  21744458063,


In [37]:
for eid in target_papers.index.values:
    if eid not in references.index.values:
        print("no")
    else:
        print(references.loc[eid])

target                                                  NaN
source    [35649012446, 33744544421, 33748050787, 800521...
Name: 1452, dtype: object
target                                                  NaN
source    [84953486796, 60449091068, 29707780, 1350435, ...
Name: 1706, dtype: object
target                                                  NaN
source    [25019629, 22351298, 84902357224, 23968402, 34...
Name: 3575, dtype: object
target                                                  NaN
source    [27331330, 27981982, 29810763, 28180952, 26694...
Name: 3605, dtype: object
target                                                  NaN
source    [11386892, 20973194, 85009186343, 41649117522,...
Name: 4283, dtype: object
target    [33392, 220271, 1544088, 36849161374, 1518950,...
source    [30295108, 33529583, 35348928119, 345003805, 2...
Name: 4334, dtype: object
target                                                  NaN
source    [85086355638, 25017237, 26441540, 80455128627,...
Name

KeyboardInterrupt: 

In [27]:
count_ls = []
scisci_ratio_ls = []
# target_papers_eid_ls
for eid in tqdm(papers.index.values):
    if eid in ref_dict.keys():
        target_ref_ls = ref_dict[eid]
        # 特定の数字が含まれているかどうかのブール配列を生成
        mask = np.isin(target_ref_ls, scisci_paper_eid_ls) 
        # ブール配列を使って特定の数字の出現回数の合計を計算
        total_count = np.sum(mask)
        count_ls.append(total_count)
        if 
        scisci_ratio_ls.append(total_count/len(target_ref_ls))
    else:
        count_ls.append(np.nan)
        scisci_ratio_ls.append(np.nan)

  0%|                                 | 14/59370518 [00:00<8:04:48, 2041.02it/s]


TypeError: object of type 'float' has no len()

In [29]:
np.nan == target_ref_ls

False

In [None]:
count_ls = []
scisci_ratio_ls = []
# target_papers_eid_ls
for eid in tqdm(papers.index.values):
    if eid in ref_dict.keys():
        target_ref_ls = ref_dict[eid]
        # 特定の数字が含まれているかどうかのブール配列を生成
        mask = np.isin(target_ref_ls, scisci_paper_eid_ls) 
        # ブール配列を使って特定の数字の出現回数の合計を計算
        total_count = np.sum(mask)
        count_ls.append(total_count)
        scisci_ratio_ls.append(total_count/len(target_ref_ls))
    else:
        count_ls.append(np.nan)
        scisci_ratio_ls.append(np.nan)

In [117]:
count_ls

[nan,
 nan,
 nan,
 nan,
 nan,
 0,
 nan,
 nan,
 0,
 0,
 nan,
 nan,
 nan,
 0,
 0,
 0,
 nan,
 nan,
 nan,
 0,
 0,
 nan,
 nan,
 nan,
 nan,
 nan,
 0,
 nan,
 0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 0,
 nan,
 0,
 nan,
 0,
 nan,
 nan,
 nan,
 nan,
 0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 0,
 nan,
 0,
 nan,
 nan,
 nan,
 nan,
 nan,
 0,
 nan,
 nan,
 nan,
 0,
 nan,
 nan,
 0,
 nan,
 0,
 nan,
 nan,
 0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 0,
 nan,
 nan,
 0,
 0,
 nan,
 0,
 nan,
 nan,
 nan,
 nan,
 0,
 nan,
 0,
 nan,
 nan,
 nan,
 nan,
 nan,
 0,
 0,
 0,
 nan,
 nan,
 nan,
 0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 0,
 0,
 nan,
 nan,
 nan,
 0,
 nan,
 nan,
 0,
 nan,
 nan,
 0,
 nan,
 nan,
 nan,
 nan,
 nan,
 0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 0,
 nan,
 0,
 nan,
 nan,
 0,
 nan,
 n

In [116]:
scisci_ratio_ls

[nan,
 nan,
 nan,
 nan,
 nan,
 0.0,
 nan,
 nan,
 0.0,
 0.0,
 nan,
 nan,
 nan,
 0.0,
 0.0,
 0.0,
 nan,
 nan,
 nan,
 0.0,
 0.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 0.0,
 nan,
 0.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 0.0,
 nan,
 0.0,
 nan,
 0.0,
 nan,
 nan,
 nan,
 nan,
 0.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 0.0,
 nan,
 0.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 0.0,
 nan,
 nan,
 nan,
 0.0,
 nan,
 nan,
 0.0,
 nan,
 0.0,
 nan,
 nan,
 0.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 0.0,
 nan,
 nan,
 0.0,
 0.0,
 nan,
 0.0,
 nan,
 nan,
 nan,
 nan,
 0.0,
 nan,
 0.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 0.0,
 0.0,
 0.0,
 nan,
 nan,
 nan,
 0.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 0.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 0.0,
 0.0,
 nan,
 nan,
 nan,
 0.0,
 nan,
 nan,
 0.0,
 nan,
 nan,
 0.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 0.0,
 nan,
 nan,
 nan

In [86]:
import numpy as np

# numpy配列の定義
arr = np.array([1, 2, 2, 3, 4, 5, 6, 2, 3, 5, 5])

# 特定の数字のリスト
targets = [2, 3, 5]

# 特定の数字が含まれているかどうかのブール配列を生成
mask = np.isin(arr, targets)

# ブール配列を使って特定の数字の出現回数の合計を計算
total_count = np.sum(mask)

# 合計の出現回数を表示
print(f"数字 {targets} の合計出現回数: {total_count}")


数字 [2, 3, 5] の合計出現回数: 8


In [87]:
mask

array([False,  True,  True,  True, False,  True, False,  True,  True,
        True,  True])