# 論文分類

In [1]:
import pandas as pd
from urllib.request import urlopen
from urllib.error import HTTPError
import json 
import numpy as np

## 1. 讀取原始資料

條目總數：1331 筆

In [2]:
# Location of the raw spreadsheet.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
fname = '/home/jovyan/Projects/Workbench/TW_parachute/6.1331_TW_others_Zheng.xlsx'
# Read the spreadsheet into the DataFrame that every later cell works from.
df = pd.read_excel(fname)

## 2. 檢視有 DOI 的資料

條目總數：1014 筆

In [3]:
# Keep only the rows whose DOI field ('DI') is filled in, then show how many remain.
df_has_doi = df.loc[df['DI'].notna()]
len(df_has_doi)

1014

### 2.1 由重複 DOI 挑出重複的條目

In [4]:
# Distinct DOI strings and the number of entries carrying each of them.
values, counts = np.unique(df_has_doi['DI'], return_counts=True)

# DOIs shared by more than two entries (expected to be none).
values[counts > 2]

array([], dtype=object)

比對重複 DOI 條目資料的相似度後，挑出重複的條目 (`list_r`)

In [5]:
# For every DOI that occurs exactly twice, print both entries (row label and
# title) so they can be compared, and record the second one as the duplicate.
list_r = []
for shared_doi in values[counts == 2]:
    pair = df_has_doi[df_has_doi['DI'] == shared_doi]
    for label, title in zip(pair.index, pair['TI']):
        print('{}, {}'.format(label, title))
    list_r.append(pair.index[1])

1210, Meteor radar wind over ChungLi 249N 121E Taiwan for the period 1025 November 2012 which includes Leonid meteor shower Comparison with empirical model and satellite measurements
1220, Meteor radar wind over ChungLi 249 degrees N 121 degrees E Taiwan for the period 1025 November 2012 which includes Leonid meteor shower Comparison with empirical model and satellite measurements
1245, Paleoclimatological and paleoenvironmental records since 4 000 a B P in sediments of alpine lakes in Taiwan
1246, Paleoclimatological and paleoenvironmental records since 4000 a BP in sediments of alpine lakes in Taiwan
754, A first modeling of dynamic and static crustal strain field from nearfield dilatation measurements example of the 2013 Mw 62 Ruisui earthquake Taiwan
766, A first modeling of dynamic and static crustal strain field from nearfield dilatation measurements example of the 2013 62 Ruisui earthquake Taiwan
735, Assessment of flood mitigation through riparian detention in response to a cha

以下的條目有不同的 DOI 字串（例如 290 與 291 的 DOI 僅有大小寫的差異），但實際上兩兩屬於同一篇論文。把重複條目的其中之一手動加到 `list_r`。

In [6]:
# Display rows 290/291 and 304/305 together so each pair can be compared by eye.
df_has_doi.loc[[290, 291, 304, 305]]

Unnamed: 0,PY,JI,TI,AU,AB,C1,AU_CO,AU1_CO,AUc_CO,N_AU,...,DI,DT,ID,PU,SO,TC,VL,DB,AU_UN,AUc_UN
290,2001.0,Geol. Mag.,Structural evolution and significance of a mél...,Chang C;Angelier J;Huang C;Liu C,The analysis of 'mélanges' of various types (s...,"Laboratoire de Tectonique Quantitative, Univer...",FRANCE;TAIWAN;TAIWAN;TAIWAN;TAIWAN,FRANCE,FRANCE,4,...,10.1017/s0016756801005970,Article,Deformation; Faulting; Landforms; Sedimenta...,Cambridge University Press,Geological Magazine,57,138,scopus,UNIVERSITÉ P. AND M. CURIE;NATIONAL TAIWAN UNI...,LABORATOIRE TECTONIQUE QUANTITATIVE
291,2001.0,Geol. Mag.,Structural evolution and significance of a mel...,Chang C;Angelier J;Huang C;Liu C,The analysis of 'melanges' of various types (s...,"Univ Paris 06, Lab Tecton Quantitat, F-75252 P...",FRANCE;TAIWAN;TAIWAN,FRANCE,FRANCE,4,...,10.1017/S0016756801005970,Article,LONGITUDINAL VALLEY; SUTURE ZONE; FAULT; PLATE...,CAMBRIDGE UNIV PRESS,GEOLOGICAL MAGAZINE,56,138,ISI,FRANCE.;NATL TAIWAN UNIV;NATL TAIWAN UNIV,
304,1996.0,Geology,Extensional collapse of the northern Taiwan mo...,Teng L,As an active collision zone between the Luzon ...,,,,TAIWAN,1,...,10.1130/0091-7613(1996)024<0949:ECOTNT>2.3.CO;2,Article,BACK-ARC BASIN; OKINAWA TROUGH; COLLISION; LIT...,GEOLOGICAL SOC AMERICA,GEOLOGY,205,24,ISI,,NATL TAIWAN UNIV
305,1997.0,Geology,Extensional collapse of the northern Taiwan mo...,Chen C,,,,,TAIWAN,1,...,10.1130/0091-7613(1997)025<0855:ECOTNT>2.3.CO;2,Article,ARC; BASALTS; ISLAND,GEOLOGICAL SOC AMERICA,GEOLOGY,14,25,ISI,,INST EARTH SCI


In [7]:
# Manually flag one entry of each pair as a duplicate; the membership test
# keeps the cell safe to re-run without appending the same label twice.
for duplicate_label in (291, 305):
    if duplicate_label not in list_r:
        list_r.append(duplicate_label)

### 2.2 為剩下的條目分類

- `list_a`: A 類   (非目標主題)
- `list_b1`: B1 類 (目標主題, 使用公開或可經付費取得之資料, 作者群中至少一人任職於台灣機構)
- `list_b2`: B2 類 (目標主題, 使用非公開、需正式申請或透過科研合作才可取得之資料, 作者群中至少一人任職於台灣機構)
- `list_c1`: C1 類 (目標主題, 使用公開或可經付費取得之資料, 作者群中無人任職於台灣機構)

#### 分類的細節 (my rubrics)

1. 目標主題：**固體地球、物理海洋學、古生物與古環境研究**，地理區域涵蓋**台灣地區** (中華民國實際管轄陸域及周圍十二海浬 (22.2 km) 水域，包括台灣島、基隆北方三島、龜山島、金門列島、馬祖列島、澎湖群島、小琉球、綠島、蘭嶼、東沙島、太平島)。 
2. 以下相關領域**算做目標主題**：

  - 研究現代生物作用作為古環境研究框架的研究（花粉學、珊瑚定年等等）
  - 洪水與土石流、泥流相關研究
  - 火山學（即使採集的只有氣體）
  - 與地震或構造運動相關的水文學研究（地下水流量變化等等）

3. 以下相關領域**不算目標主題**：

  - 生態學
  - 人類學（考古遺址、鑑種等等）
  - 漁業學
  - 地球科學史、科學教育與傳播研究
  - 純論測地學技術之研究（GNSS、重力等等）
  - 以環境汙染為主題之研究（PM2.5、水汙染、土壤汙染等等）
  - 氣象學
  - 純論地理資訊系統與地理資料科學之研究（Geoinformatics）
  - 社會學、社會地理學、地緣政治學
  - 公共衛生相關研究
  - 都市規劃、城鄉發展、土地利用、環境社會學、農業與環境的互動
  - 利用遙測資料進行的生物圈研究（NDVI、樹冠層厚度等等）
  - 自然災害應變系統
  - 以水資源利用或污染傳播作為焦點的水文學研究
  - 以工程建設（水壩）或汙染/有機物質傳播作為焦點的河流沉積物研究
  - 核電廠相關之環境科學

4. 文章使用的**資料類型的判定標準**：

  - 所有資料都以 2022 年 4 月後的開放程度為準。(也就是說，某筆資料在作者撰寫論文的時候可能還是非公開資料，但只要資料在 2022 年能公開或付費獲取，就算做公開資料。)
  - 作者初次在文章中發表、在野外實地取得的資料都算做非公開資料，像是照片、影片、距離/方位量測、溫度、流速等等。
  - 實驗室分析之樣本為非公開資料 (除非是可以買到的樣本，像是商業流通的化石等，不過我這邊沒有這種例子)
  - 只有實際用於科學分析的資料才納入考慮。例如作者使用非公開 DEM 只來當地圖底圖的話，就不算數。
  - 只考慮與台灣地區相關的資料。例如作者是否使用來自台灣以外的岩石樣品不影響資料類型的分類判定。
  - 同時使用公開與非公開的資料的情況下，算做使用非公開資料的文章。
  - 常見的非公開與公開資料庫列表可參考「規則」Google Docs。

In [8]:
# Category A (not a target topic): row labels of entries that have a DOI.
# The values are hand-assigned; lines are grouped roughly by hundreds.
list_a = [1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 21, 37, 38, 39, 42, 44, 45, 46, 47, 48, 49, 51, 52, 57, 74, 77, 78, 80, 88, 89, 90, 91, 92, 93, 94, 96, 97, 98, 99, 
    100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 137, 141, 142, 146, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 177, 178, 180, 185, 187, 190, 192, 193, 197, 198, 199, 
    200, 201, 202, 203, 207, 210, 211, 215, 217, 218, 219, 220, 221, 222, 223, 228, 230, 241, 242, 244, 249, 255, 264, 265, 267, 274, 275, 276, 277, 278, 286, 287, 288, 
    303, 344, 345, 347, 349, 350, 353, 354, 355, 357, 359, 360, 361, 362, 364, 365, 392, 393, 394, 395, 396, 397, 398, 399, 
    400, 401, 402, 403, 417, 423, 424, 425, 426, 427, 428, 429, 431, 432, 433, 434, 437, 438, 439, 440, 441, 442, 443, 444, 448, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 476, 477, 478, 479, 485, 486, 487, 490, 491, 494, 496, 497, 498, 499, 
    500, 501, 502, 503, 504, 505, 506, 507, 510, 514, 523, 537, 538, 540, 553, 554, 561, 564, 568, 570, 574, 578, 581, 582, 583, 587, 588, 589, 590, 593, 599, 
    601, 602, 606, 607, 614, 620, 621, 622, 627, 630, 633, 634, 640, 641, 646, 648, 653, 656, 658, 666, 669, 671, 674, 676, 678, 691, 
    711, 721, 722, 723, 724, 725, 727, 728, 729, 730, 731, 734, 735, 737, 738, 742, 743, 746, 748, 749, 751, 753, 757, 758, 759, 760, 761, 762, 763, 764, 765, 769, 780, 781, 784, 786, 791, 792, 
    884, 885, 886, 887, 888, 889, 890, 892, 899, 
    907, 910, 914, 915, 916, 917, 918, 919, 920, 921, 922, 930, 931, 932, 933, 934, 935, 936, 937, 938, 941, 942, 943, 944, 945, 946, 950, 969, 
    1039, 1040, 1041, 1043, 1045, 1046, 1047, 1048, 1050, 1085, 1090, 1091, 1092, 1093, 1097, 
    1108, 1114, 1115, 1117, 
    1207, 1208, 1210, 1211, 1212, 1213, 1214, 1215, 1216, 1217, 1218, 1219, 1221, 1222, 1223, 1224, 1225, 1226, 1232, 1233, 1234, 1235, 1236, 1237, 1238, 1239, 1240, 1241, 1244, 1249, 1252, 1253, 1260, 1262, 1263, 1264, 1267, 1270, 1272, 1275, 1276, 1277, 1281, 1282, 1284, 1286, 1287, 1288, 1289, 1291, 1292, 1293, 1294, 
    1305, 1309, 1310, 1311, 1312, 1313, 1315, 1318, 1324, 1326, 1329, 1330]

# Category B1 (target topic, public or purchasable data, at least one author
# at a Taiwanese institution): row labels of entries that have a DOI.
# The labels are not strictly ascending; they are sorted later when merged.
list_b1 = [33, 59, 66, 83, 95, 121, 130, 181, 184, 186, 204, 208, 216, 272, 289, 295, 298, 
    304, 310, 318, 330, 351, 356, 358, 369, 377, 385, 388, 430, 449, 450, 481, 489, 492, 
    524, 527, 533, 535, 536, 545, 549, 550, 552, 562, 575, 577, 594, 596, 
    605, 609, 611, 613, 619, 623, 628, 635, 637, 638, 642, 649, 650, 651, 652, 659, 662, 663, 665, 672, 675, 677, 686, 687, 697, 
    702, 707, 709, 710, 713, 770, 768, 771, 775, 776, 777, 779, 891, 900, 901, 912, 962, 
    1058, 1102, 1105, 1107, 1243, 1250, 1290, 1295, 1299, 1303, ]

# Category B2 (target topic, non-public data that needs a formal request or a
# research collaboration, at least one author at a Taiwanese institution):
# row labels of entries that have a DOI.
# The labels are not strictly ascending; they are sorted later when merged.
list_b2 = [5, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 40, 41, 50, 53, 58, 60, 62, 64, 67, 69, 71, 72, 73, 75, 76, 79, 81, 82, 84, 85, 86, 87, 
    122, 123, 124, 125, 127, 129, 132, 133, 134, 135, 136, 138, 139, 140, 143, 144, 145, 147, 148, 173, 174, 175, 176, 179, 183, 189, 191, 194, 195, 
    205, 206, 214, 209, 212, 213, 224, 225, 226, 227, 229, 231, 234, 235, 237, 239, 240, 243, 245, 247, 248, 250, 251, 252, 254, 256, 257, 259, 262, 266, 268, 269, 270, 271, 273, 290, 292, 293, 294, 296, 297, 299, 
    300, 301, 309, 311, 312, 313, 314, 316, 319, 321, 322, 323, 324, 325, 326, 327, 328, 329, 331, 332, 333, 334, 335, 336, 337, 338, 339, 342, 343, 348, 352, 363, 366, 367, 368, 370, 371, 372, 373, 374, 375, 376, 378, 379, 380, 381, 382, 383, 384, 386, 387, 389, 391, 
    404, 405, 411, 412, 413, 415, 416, 418, 419, 420, 421, 422, 435, 436, 445, 446, 447, 451, 452, 454, 475, 480, 482, 483, 484, 488, 493, 
    508, 509, 511, 512, 513, 515, 516, 517, 518, 519, 520, 521, 522, 525, 526, 528, 529, 530, 531, 532, 534, 539, 541, 543, 544, 546, 547, 548, 551, 556, 557, 558, 560, 563, 565, 567, 569, 571, 572, 573, 579, 580, 584, 585, 591, 592, 595, 597, 
    603, 604, 608, 610, 615, 616, 617, 618, 624, 625, 626, 629, 631, 636, 639, 643, 645, 647, 654, 655, 657, 660, 661, 664, 667, 668, 670, 673, 679, 680, 681, 682, 683, 684, 685, 688, 689, 690, 692, 693, 694, 695, 698, 699, 
    700, 701, 703, 704, 705, 706, 712, 714, 716, 717, 726, 732, 733, 739, 740, 741, 745, 747, 750, 754, 755, 756, 767, 772, 773, 774, 782, 783, 785, 787, 788, 789, 790, 
    881, 882, 893, 895, 896, 897, 898, 
    902, 903, 904, 905, 906, 908, 909, 911, 913, 923, 924, 925, 926, 927, 929, 939, 940, 951, 952, 953, 955, 956, 957, 958, 960, 961, 963, 964, 965, 966, 967, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 
    1036, 1037, 1038, 1042, 1044, 1049, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1086, 1087, 1088, 1089, 1094, 1095, 1099, 
    1100, 1101, 1103, 1104, 1106, 1109, 1111, 1112, 1113, 
    1227, 1228, 1231, 1245, 1247, 1251, 1255, 1256, 1257, 1261, 1265, 1266, 1268, 1269, 1271, 1273, 1278, 1283, 1285, 1296, 1297, 1298, 
    1300, 1301, 1302, 1304, 1306, 1307, 1320, 1321, ]

# Category C1 (target topic, public or purchasable data, no author at a
# Taiwanese institution): row labels of entries that have a DOI.
list_c1 = [317, ]

### 2.3 驗證

以上全部加起來應該要有 1014 個條目。

In [9]:
# Turn every label list of the DOI subset into a flat 1-D array.
y_r, y_a, y_b1, y_b2, y_c = (
    np.asarray(labels).ravel()
    for labels in (list_r, list_a, list_b1, list_b2, list_c1)
)

# All labels that received a classification, in ascending order; the cell
# displays how many there are.
classified_idx = np.sort(np.concatenate([y_r, y_a, y_b1, y_b2, y_c]))
len(classified_idx)

1014

所有在 `df_has_doi` 內的條目應該都要被分類到。

In [10]:
# Boolean flag per DOI row: True when its label appears among the classified labels.
classified = df_has_doi.index.isin(classified_idx.astype(np.int64))
# Rows that were missed by the classification (displayed; expected to be empty).
not_classified = df_has_doi[np.logical_not(classified)]
not_classified

Unnamed: 0,PY,JI,TI,AU,AB,C1,AU_CO,AU1_CO,AUc_CO,N_AU,...,DI,DT,ID,PU,SO,TC,VL,DB,AU_UN,AUc_UN


## 3. 檢視沒有 DOI 的資料

條目總數：317 筆

In [11]:
# Rows whose DOI field ('DI') is empty, followed by their count.
nodoi_entries = df.loc[df['DI'].isnull()]
len(nodoi_entries)

317

### 3.1 分類

- `list2_r`: 重複的條目
- `list2_a`: A 類   (非目標主題)
- `list2_b1`: B1 類 (目標主題, 使用公開或可經付費取得之資料, 作者群中至少一人任職於台灣機構)
- `list2_b2`: B2 類 (目標主題, 使用非公開、需正式申請或透過科研合作才可取得之資料, 作者群中至少一人任職於台灣機構)
- `list2_cannot_check`: 無法分類 (無法只憑標題判定是否為目標主題，也找不到文章內文)

In [12]:
# Duplicate entries among the rows without a DOI (row labels).
list2_r = [56, 306]
# 306 duplicates 304 (which has a DOI)

# Category A (not a target topic): row labels of entries without a DOI.
list2_a = [3, 18, 19, 110, 188, 196, 261, 279, 280, 281, 282, 283, 284, 285, 340, 341, 406, 407, 409, 410, 455,
           718, 720, 744, 883, 928, 970, 971, 972, 973, 
           1033, 1118, 1209, 1242, 1259, 1314, 1316, 1317, 1319, 1322, 1323, 1325, 1327, 1328, ]

# Category B1 (target topic, public or purchasable data, at least one author
# at a Taiwanese institution): row labels of entries without a DOI.
list2_b1 = [308, 408, 949, ]

# Category B2 (target topic, non-public data, at least one author at a
# Taiwanese institution): row labels of entries without a DOI.
list2_b2 = [0, 61, 232, 236, 238, 260, 307, 894, 947, 948, 974, 
            1096, 1248, 1254, 1279, 1280, ]

# Entries without a DOI that could not be classified: the title alone does not
# settle whether the paper is on a target topic, and no full text was found.
list2_cannot_check = [54, 55, 233, 453, 719, 793, 794, 795, 796, 797, 798, 799,
                      800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 
                      840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862 ,863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 
                      991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009,
                      1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032,
                      1034, 1035, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 
                      1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1158, 
                      1159, 1160, 1161, 1162, 1163, 1164, 1165, 1166, 1167, 1168, 1169, 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1177, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, 1192, 1193, 1194, 1195, 1196, 1197, 1198, 1199,
                      1200, 1201, 1202, 1203, 1204, 1205, 1206, 
                      1230, ]
# Articles from the following journals could not be found in full text via Google:
#     J. Geol. Soc. China
#     Journal - Geological Society of China
#     Memoir - Geological Society of China [except 1059, which was republished in Tectonophysics; it is still kept in cannot_check here to avoid double counting]
#     Proceedings - Geological Society of China

### 3.2 驗證

以上全部加起來應該要有 317 個條目。

In [13]:
# Turn every label list of the no-DOI subset into a flat 1-D array.
n_r, n_a, n_b1, n_b2, n_na = (
    np.asarray(labels).ravel()
    for labels in (list2_r, list2_a, list2_b1, list2_b2, list2_cannot_check)
)

# All labels that received a classification, in ascending order; the cell
# displays how many there are.
classified2_idx = np.sort(np.concatenate([n_r, n_a, n_b1, n_b2, n_na]))
len(classified2_idx)

317

所有在 `nodoi_entries` 內的條目應該都要被分類到。

In [14]:
# Boolean flag per no-DOI row: True when its label appears among the classified labels.
nodoi_entries_classified = nodoi_entries.index.isin(classified2_idx.astype(np.int64))
# Rows that were missed by the classification (displayed; expected to be empty).
nodoi_not_classified = nodoi_entries[np.logical_not(nodoi_entries_classified)]
nodoi_not_classified

Unnamed: 0,PY,JI,TI,AU,AB,C1,AU_CO,AU1_CO,AUc_CO,N_AU,...,DI,DT,ID,PU,SO,TC,VL,DB,AU_UN,AUc_UN


## 4. 合併以上結果

In [15]:
# Merge the DOI and no-DOI label arrays of each group into one sorted array:
# r = duplicates, a = category A, b1 = category B1, b2 = category B2.
r, a, b1, b2 = (
    np.sort(np.concatenate(pair))
    for pair in ((y_r, n_r), (y_a, n_a), (y_b1, n_b1), (y_b2, n_b2))
)
# C1 only occurs among the entries with a DOI.
c = y_c
# "Cannot classify" only occurs among the entries without a DOI.
na = n_na

In [16]:
# Print the number of entries in each group (duplicates, A, B1, B2, C1, unclassifiable).
group_sizes = [len(members) for members in (r, a, b1, b2, c, na)]
print(' 重複: \t {} \n A類: \t {} \n B1類: \t {} \n B2類: \t {} \n C1類: \t {} \n 無法分:\t {}'.format(*group_sizes))

 重複: 	 54 
 A類: 	 479 
 B1類: 	 103 
 B2類: 	 442 
 C1類: 	 1 
 無法分:	 252


## 5. 填上資金來源

所有的 B/C 類 (546 筆資料) 都必須要調查研究資金來源。

- `taiwan_funding`: 文章中具體寫出研究收到台灣政府或民間組織的財政支持。**部分支持**與**國際合作計畫**也算。
- `foreign_funding`: 文章中具體寫出研究收到非台灣政府或民間組織的財政支持，並且沒有提及台灣政府或民間組織的資金貢獻。
- `nofunding`: 文章中並未寫出研究是否收到財政支持。

**財政支持只能是以科學計畫獎助或是研究獎(學)金的形式撥款**。派遣野外採集的人力、樣本或資料的傳遞、研究船航次、學術會議主辦國的貢獻等等都不算是對研究的財政支持。

In [17]:
# Row labels of B/C entries whose paper explicitly acknowledges financial
# support from a Taiwanese government or private organisation (partial support
# and international joint projects included).
# The labels are not strictly ascending; they are sorted later when collected.
taiwan_funding = [0, 22, 23, 24, 25, 26, 27, 28, 30, 31, 32, 34, 35, 58, 59, 60, 62, 64, 66, 67, 69, 71, 72, 73, 76, 81, 82, 83, 84, 87, 
    122, 123, 124, 125, 127, 129, 130, 132, 134, 136, 138, 139, 140, 145, 147, 148, 173, 174, 176, 179, 183, 184, 191, 195, 
    204, 205, 206, 208, 209, 212, 214, 216, 224, 225, 229, 231, 232, 234, 235, 236, 237, 238, 239, 240, 243, 245, 247, 248, 250, 251, 252, 257, 262, 266, 268, 269, 271, 272, 293, 290, 292, 296, 298, 299, 
    304, 307, 308, 311, 313, 314, 316, 319, 323, 329, 331, 332, 333, 334, 335, 336, 337, 338, 339, 348, 352, 356, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 381, 382, 383, 384, 385, 386, 387, 388, 
    405, 411, 415, 418, 419, 420, 421, 435, 445, 446, 450, 451, 452, 480, 481, 482, 483, 484, 488, 489, 492, 493, 
    508, 509, 512, 516, 517, 520, 521, 522, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 539, 541, 543, 545, 546, 547, 548, 549, 550, 551, 552, 556, 560, 562, 563, 565, 567, 569, 571, 573, 575, 577, 579, 580, 584, 585, 591, 594, 595, 596, 597, 
    604, 605, 609, 610, 611, 613, 615, 616, 617, 618, 619, 623, 625, 626, 628, 629, 631, 635, 637, 638, 639, 642, 643, 645, 647, 649, 650, 651, 652, 654, 655, 657, 659, 661, 662, 663, 664, 665, 668, 670, 672, 673, 677, 679, 680, 681, 682, 683, 685, 686, 687, 688, 689, 690, 692, 694, 697, 698, 
    700, 701, 702, 703, 704, 705, 706, 707, 709, 710, 712, 713, 714, 716, 717, 726, 733, 739, 740, 741, 745, 747, 750, 754, 767, 768, 771, 773, 775, 777, 779, 785, 787, 881, 882, 893, 894, 895, 896, 
    900, 902, 903, 904, 905, 908, 909, 911, 923, 924, 925, 926, 927, 929, 940, 947, 948, 949, 951, 952, 953, 955, 956, 957, 958, 960, 961, 962, 963, 965, 967, 974, 975, 976, 978, 979, 980, 981, 984, 985, 986, 987, 988, 989, 990, 
    1036, 1037, 1042, 1044, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1088, 1094, 1095, 1099, 1100, 1101, 1102, 1103, 1104, 1106, 1107, 
    1227, 1228, 1255, 1256, 1257, 1261, 1266, 1268, 1269, 1280, 1283, 1290, 1295, 1296, 1297, 1299, 1300, 1302, 1303, 1304, 1306, 1307, 1320, 1321, ]

# Row labels of B/C entries whose paper explicitly acknowledges financial
# support from non-Taiwanese sources and mentions no Taiwanese funding.
# The labels are not strictly ascending; they are sorted later when collected.
foreign_funding = [5, 36, 61, 135, 143, 144, 181, 186, 
    213, 226, 254, 256, 270, 294, 297, 300, 318, 321, 322, 324, 326, 327, 330, 358, 363, 389, 391, 
    513, 557, 558, 572, 603, 608, 636, 660, 675, 684, 695, 699, 770, 774, 755, 782, 783, 789, 790, 
    912, 913, 939, 964, 966, 977, 983, 1049, 1087, 1089, 1111, 1112, 1113, 1273, 1298, 1301, ]

# Row labels of B/C entries whose paper does not state whether the research
# received financial support.
nofunding = [29, 33, 40, 41, 50, 53, 75, 79, 85, 86, 95, 
    121, 133, 175, 189, 194, 227, 259, 260, 273, 289, 295, 
    301, 309, 310, 312, 317, 325, 328, 342, 343, 351, 366, 367, 368, 380, 
    404, 408, 412, 413, 416, 422, 430, 436, 447, 449, 454, 475, 
    511, 515, 518, 519, 544, 592, 624, 667, 693, 732, 756, 772, 776, 788, 891, 897, 898, 
    901, 906, 982, 
    1038, 1086, 1096, 1105, 1109, 1231, 1243, 1245, 1247, 1248, 1250, 1251, 1254, 1265, 1271, 1278, 1279, 1285, ]

### 5.1 驗證

以上全部加起來應該要有 546 個條目。

In [18]:
# Turn each funding label list into a flat 1-D array.
f_t, f_f, f_n = (
    np.asarray(labels).ravel()
    for labels in (taiwan_funding, foreign_funding, nofunding)
)

# Every label that has a funding annotation, in ascending order; the cell
# displays how many there are.
f_collection = np.sort(np.concatenate([f_t, f_f, f_n]))
len(f_collection)

546

所有在 B 或 C 類內的條目應該都要被分類到。

In [19]:
# Sorted labels of every B1, B2 and C1 entry.
bc_collection = np.sort(np.concatenate((b1, b2, c)))
# True where a B/C label also carries a funding annotation.
mask = np.isin(bc_collection, f_collection)
# B/C labels without a funding annotation (displayed; expected to be empty).
bc_collection[np.logical_not(mask)]

array([], dtype=int64)

## 6. 輸出結果

In [20]:
# Copy the data into df_out first, and only modify that copy from here on
df_out = df.copy()

In [21]:
# Write the group label of every entry into the 'Group' column.
# 'R' marks duplicate entries and 'X' marks entries that could not be classified.
for group_label, members in (('A', a), ('B1', b1), ('B2', b2), ('C', c), ('R', r), ('X', na)):
    df_out.loc[members, 'Group'] = group_label

# Write the funding code into 'FU_TW'; see the "rules" Google Docs for the
# meaning of the numeric codes.
for funding_code, members in ((1, f_t), (2, f_f), (0, f_n)):
    df_out.loc[members, 'FU_TW'] = funding_code

In [22]:
# Manually update the AUc_CO and AU_TW fields.
# For papers that do not name a corresponding author, the first author is
# treated as the corresponding author.
df_out.loc[237, 'AUc_CO'] = 'USA'
df_out.loc[237, 'AU_TW'] = 2
df_out.loc[238, 'AUc_CO'] = 'USA'
df_out.loc[238, 'AU_TW'] = 2
df_out.loc[257, 'AUc_CO'] = 'FRANCE'
df_out.loc[273, 'AUc_CO'] = 'TAIWAN'
df_out.loc[289, 'AUc_CO'] = 'TAIWAN'
df_out.loc[301, 'AUc_CO'] = 'TAIWAN'
df_out.loc[367, 'AUc_CO'] = 'TAIWAN'
df_out.loc[756, 'AUc_CO'] = 'TAIWAN'
df_out.loc[929, 'AUc_CO'] = 'USA'
df_out.loc[929, 'AU_TW'] = 3

# Fix an odd-looking title.
df_out.loc[5, 'TI'] = 'Terrestrial biomarker isotope records of late Quaternary climate and source-to-sink sediment transport processes in southwestern Taiwan'

# Fill in or correct DOIs.
df_out.loc[57, 'DI'] = '10.2113/gssgfbull.180.2.155'
df_out.loc[61, 'DI'] = '10.1007/s004450050241'
df_out.loc[187, 'DI'] = '10.2113/gsecongeo.97.3.593'
df_out.loc[236, 'DI'] = '10.1016/0016-7037(88)90037-3'
df_out.loc[238, 'DI'] = '10.1016/0016-7037(90)90165-H'
# These two DOIs were previously stored in URL percent-encoded form
# ('%3C' / '%3E'). The DOIs already present in the 'DI' column keep the literal
# '<' and '>' characters, so the raw form is stored here as well to keep the
# column consistent; percent-encode only when building a URL from it.
df_out.loc[307, 'DI'] = '10.1130/0091-7613(1998)026<0279:SRASSO>2.3.CO;2'
df_out.loc[308, 'DI'] = '10.1130/0091-7613(2000)28<155:SBAAMF>2.0.CO;2'
df_out.loc[342, 'DI'] = '10.22059/GEOPE.2020.301603.648548'
df_out.loc[364, 'DI'] = '10.1680/gein.2003.10.1.2'
df_out.loc[455, 'DI'] = '10.1016/S1001-6279(11)60103-0'
df_out.loc[502, 'DI'] = '10.1023/A:1008282623697'
# NOTE(review): this value is a ScienceDirect web address, not a DOI; it will
# presumably not resolve when prefixed with 'https://doi.org/'.
df_out.loc[509, 'DI'] = 'www.sciencedirect.com/science/article/abs/pii/S0743954796000761'
df_out.loc[756, 'DI'] = '10.11366/sokuchi1954.30.213'
df_out.loc[889, 'DI'] = '10.3826/jhr.2008.3173'
df_out.loc[894, 'DI'] = '10.1046/j.0263-4929.2001.00331.x'
df_out.loc[923, 'DI'] = '10.1023/A:1008067928365'
df_out.loc[928, 'DI'] = '10.1111/jpg.12748'
df_out.loc[948, 'DI'] = '10.1016/0743-9547(94)00041-C'
df_out.loc[949, 'DI'] = '10.1016/0743-9547(94)00040-L'
df_out.loc[974, 'DI'] = '10.1016/0377-0273(95)00028-3'
df_out.loc[1248, 'DI'] = '10.1007/BF02907199'
# NOTE(review): this value is a handle (hdl.handle.net) address, not a DOI; it
# will presumably not resolve when prefixed with 'https://doi.org/'.
df_out.loc[1254, 'DI'] = 'hdl.handle.net/10097/45241'
df_out.loc[1259, 'DI'] = '10.1046/j.1365-3091.1998.00175.x'
df_out.loc[1279, 'DI'] = '10.1179/sre.2002.36.286.568'
df_out.loc[1280, 'DI'] = '10.1179/sre.2002.36.284.423'
df_out.loc[1281, 'DI'] = '10.1179/003962603791482613'

In [23]:
# Save the annotated table as CSV; index=False leaves the DataFrame index out
# of the file. The relative path is resolved against the current working directory.
df_out.to_csv('6.1331_TW_others_Zheng_done.csv', index=False)

In [24]:
# Build the doi.org URL of a single entry (row 929) by prefixing its 'DI' value.
# NOTE(review): the value is concatenated as-is, with no URL escaping.
'https://doi.org/' + df_out.loc[929, 'DI']

'https://doi.org/10.1093/petrology/16.1.80'